Merge branch 'master' into auto-model-support
Commit 36bea177cb
45 changed files with 1526 additions and 179 deletions

.github/labeler.yml (vendored, 14 lines changed)

@@ -1,5 +1,16 @@
 # https://github.com/actions/labeler
+Kompute:
+  - changed-files:
+      - any-glob-to-any-file:
+          - ggml-kompute.h
+          - ggml-kompute.cpp
+          - README-kompute.md
+Apple Metal:
+  - changed-files:
+      - any-glob-to-any-file:
+          - ggml-metal.h
+          - ggml-metal.cpp
+          - README-metal.md
 SYCL:
   - changed-files:
       - any-glob-to-any-file:
@@ -9,6 +20,7 @@ SYCL:
 Nvidia GPU:
   - changed-files:
       - any-glob-to-any-file:
+          - ggml-cuda.h
           - ggml-cuda/**
 Vulkan:
   - changed-files:

.github/workflows/docker.yml (vendored, 5 lines changed)

@@ -42,8 +42,9 @@ jobs:
           - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
+          # TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
+          #- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          #- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4

@@ -72,6 +72,7 @@ else()
     set(INS_ENB ON)
 endif()
 
+option(LLAMA_SVE "llama: enable SVE" OFF)
 option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
 option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
 option(LLAMA_AVX512 "llama: enable AVX512" OFF)
@@ -1040,6 +1041,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
             # Raspberry Pi 3, 4, Zero 2 (32-bit)
             list(APPEND ARCH_FLAGS -mno-unaligned-access)
         endif()
+        if (LLAMA_SVE)
+            list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+        endif()
     endif()
 elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
         (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND

@@ -203,6 +203,10 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 
+**Tools:**
+
+- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+
 ---
 
 Here is a typical run using LLaMA v2 13B on M2 Ultra:

@@ -904,6 +904,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.interactive_specials = true;
         return true;
     }
+    if (arg == "--special") {
+        params.special = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1362,6 +1366,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     printf("  -h, --help            show this help message and exit\n");
     printf("  --version             show version and build info\n");
     printf("  -i, --interactive     run in interactive mode\n");
+    printf("  --special             special tokens output enabled\n");
     printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
     printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
@@ -1855,11 +1860,15 @@ bool fs_create_directory_with_parents(const std::string & path) {
 
 std::string fs_get_cache_directory() {
     std::string cache_directory = "";
+    auto ensure_trailing_slash = [](std::string p) {
+        // Make sure to add trailing slash
+        if (p.back() != DIRECTORY_SEPARATOR) {
+            p += DIRECTORY_SEPARATOR;
+        }
+        return p;
+    };
     if (getenv("LLAMA_CACHE")) {
         cache_directory = std::getenv("LLAMA_CACHE");
-        if (cache_directory.back() != DIRECTORY_SEPARATOR) {
-            cache_directory += DIRECTORY_SEPARATOR;
-        }
     } else {
 #ifdef __linux__
         if (std::getenv("XDG_CACHE_HOME")) {
@@ -1870,12 +1879,12 @@ std::string fs_get_cache_directory() {
 #elif defined(__APPLE__)
         cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
 #elif defined(_WIN32)
-        cache_directory = std::getenv("APPDATA");
+        cache_directory = std::getenv("LOCALAPPDATA");
 #endif // __linux__
+        cache_directory = ensure_trailing_slash(cache_directory);
         cache_directory += "llama.cpp";
-        cache_directory += DIRECTORY_SEPARATOR;
     }
-    return cache_directory;
+    return ensure_trailing_slash(cache_directory);
 }
 

@@ -2840,6 +2849,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
     fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
     fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+    fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
     fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
     fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
     fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");

@@ -146,6 +146,7 @@ struct gpt_params {
     bool use_color            = false; // use color to distinguish generations and inputs
     bool interactive          = false; // interactive mode
     bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+    bool special              = false; // enable special token output
     bool conversation         = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml               = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all     = false; // save user input and generations to prompt cache

@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
 
     params.custom_n_ctx = false;
 
-    params.use_flash = true;
+    params.use_flash = false;
     params.use_checkpointing = true;
 
     params.sample_start = "";

@@ -81,6 +81,7 @@ models = [
     {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", },  # WPM!
     {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
 ]
 
 

@@ -312,11 +312,10 @@ class Model:
                     data = data.astype(np.float32)
                     data_qtype = gguf.GGMLQuantizationType.F32
 
-                block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
+                shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
 
                 # reverse shape to make it similar to the internal ggml dimension order
-                shape_str = f"""{{{', '.join(str(n) for n in reversed(
-                    (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
-                )}}}"""
+                shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
 
                 # n_dims is implicit in the shape
                 logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
@@ -2342,7 +2341,8 @@ class CommandR2Model(Model):
 
         # max_position_embeddings = 8192 in config.json but model was actually
         # trained on 128k context length
-        self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
+        # aya-23 models don't have model_max_length specified
+        self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()

@@ -2415,6 +2415,157 @@ class JinaBertV2Model(BertModel):
         self.gguf_writer.add_add_eos_token(True)
 
 
+@Model.register("ArcticForCausalLM")
+class ArcticModel(Model):
+    model_arch = gguf.MODEL_ARCH.ARCTIC
+
+    def set_vocab(self):
+        # The reason for using a custom implementation here is that the
+        # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+        # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+        from sentencepiece import SentencePieceProcessor
+
+        tokenizer_path = self.dir_model / 'tokenizer.model'
+
+        if not tokenizer_path.is_file():
+            logger.error(f'Error: Missing {tokenizer_path}')
+            sys.exit(1)
+
+        # Read the whole vocabulary from the tokenizer.model file
+        tokenizer = SentencePieceProcessor()
+        tokenizer.LoadFromFile(str(tokenizer_path))
+
+        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+        scores: list[float] = [-10000.0] * vocab_size
+        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+        for token_id in range(tokenizer.vocab_size()):
+
+            piece = tokenizer.IdToPiece(token_id)
+            text = piece.encode("utf-8")
+            score = tokenizer.GetScore(token_id)
+
+            toktype = SentencePieceTokenTypes.NORMAL
+            if tokenizer.IsUnknown(token_id):
+                toktype = SentencePieceTokenTypes.UNKNOWN
+            elif tokenizer.IsControl(token_id):
+                toktype = SentencePieceTokenTypes.CONTROL
+            elif tokenizer.IsUnused(token_id):
+                toktype = SentencePieceTokenTypes.UNUSED
+            elif tokenizer.IsByte(token_id):
+                toktype = SentencePieceTokenTypes.BYTE
+
+            tokens[token_id] = text
+            scores[token_id] = score
+            toktypes[token_id] = toktype
+
+        # Use the added_tokens_decoder field from tokeniser_config.json as the source
+        # of information about added/redefined tokens and modify them accordingly.
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+
+                if "added_tokens_decoder" in tokenizer_config_json:
+                    added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
+                    for token_id, token_json in added_tokens_decoder.items():
+                        token_id = int(token_id)
+                        if (token_id >= vocab_size):
+                            logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+                            continue
+
+                        token_content = token_json["content"]
+                        token_type = SentencePieceTokenTypes.USER_DEFINED
+                        token_score = -10000.0
+
+                        # Map unk_token to UNKNOWN, other special tokens to CONTROL
+                        # Set the score to 0.0 as in the original tokenizer.model
+                        if ("special" in token_json) and token_json["special"]:
+                            if token_content == tokenizer_config_json["unk_token"]:
+                                token_type = SentencePieceTokenTypes.UNKNOWN
+                            else:
+                                token_type = SentencePieceTokenTypes.CONTROL
+                            token_score = 0.0
+
+                        logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
+                        tokens[token_id] = token_content.encode("utf-8")
+                        toktypes[token_id] = token_type
+                        scores[token_id] = token_score
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+        self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+    _experts: list[dict[str, Tensor]] | None = None
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = self.hparams.get("num_key_value_heads")
+
+        if name.endswith("q_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+        if name.endswith("k_proj.weight"):
+            data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+        # process the experts separately
+        if name.find("block_sparse_moe.experts") != -1:
+            n_experts = self.hparams["num_local_experts"]
+
+            assert bid is not None
+
+            if self._experts is None:
+                self._experts = [{} for _ in range(self.block_count)]
+
+            self._experts[bid][name] = data_torch
+
+            if len(self._experts[bid]) >= n_experts * 3:
+                tensors: list[tuple[str, Tensor]] = []
+
+                # merge the experts into a single 3d tensor
+                for wid in ["w1", "w2", "w3"]:
+                    datas: list[Tensor] = []
+
+                    for xid in range(n_experts):
+                        ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+                        datas.append(self._experts[bid][ename])
+                        del self._experts[bid][ename]
+
+                    data_torch = torch.stack(datas, dim=0)
+
+                    merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
+
+                    new_name = self.map_tensor_name(merged_name)
+
+                    tensors.append((new_name, data_torch))
+                return tensors
+            else:
+                return []
+
+        return [(self.map_tensor_name(name), data_torch)]
+
+    def write_tensors(self):
+        super().write_tensors()
+
+        if self._experts is not None:
+            # flatten `list[dict[str, Tensor]]` into `list[str]`
+            experts = [k for d in self._experts for k in d.keys()]
+            if len(experts) > 0:
+                raise ValueError(f"Unprocessed experts: {experts}")
+
+
 ###### CONVERSION LOGIC ######
 
 

@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
 
     params.samples_start_after_nl = false;
     params.use_adam               = true;
-    params.use_flash              = true;
+    params.use_flash              = false;
     params.use_scratch            = true;
 
     // only adam

@@ -7,8 +7,6 @@ android {
     namespace = "com.example.llama"
     compileSdk = 34
 
-    ndkVersion = "26.1.10909125"
-
     defaultConfig {
         applicationId = "com.example.llama"
         minSdk = 33
@@ -20,17 +18,6 @@ android {
         vectorDrawables {
             useSupportLibrary = true
         }
-        ndk {
-            // Add NDK properties if wanted, e.g.
-            // abiFilters += listOf("arm64-v8a")
-        }
-        externalNativeBuild {
-            cmake {
-                arguments += "-DCMAKE_BUILD_TYPE=Release"
-                cppFlags += listOf()
-                arguments += listOf()
-            }
-        }
     }
 
     buildTypes {
@@ -55,17 +42,6 @@ android {
     composeOptions {
         kotlinCompilerExtensionVersion = "1.5.1"
     }
-    packaging {
-        resources {
-            excludes += "/META-INF/{AL2.0,LGPL2.1}"
-        }
-    }
-    externalNativeBuild {
-        cmake {
-            path = file("src/main/cpp/CMakeLists.txt")
-            version = "3.22.1"
-        }
-    }
 }
 
 dependencies {
@@ -78,6 +54,7 @@ dependencies {
     implementation("androidx.compose.ui:ui-graphics")
     implementation("androidx.compose.ui:ui-tooling-preview")
     implementation("androidx.compose.material3:material3")
+    implementation(project(":llama"))
     testImplementation("junit:junit:4.13.2")
     androidTestImplementation("androidx.test.ext:junit:1.1.5")
     androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")

@@ -1,5 +1,6 @@
 package com.example.llama
 
+import android.llama.cpp.LLamaAndroid
 import android.util.Log
 import androidx.compose.runtime.getValue
 import androidx.compose.runtime.mutableStateOf
@@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
 import kotlinx.coroutines.flow.catch
 import kotlinx.coroutines.launch
 
-class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
+class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
     companion object {
         @JvmStatic
         private val NanosPerSecond = 1_000_000_000.0
@@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
 
         viewModelScope.launch {
            try {
-                llm.unload()
+                llamaAndroid.unload()
            } catch (exc: IllegalStateException) {
                messages += exc.message!!
            }
@@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
        messages += ""
 
        viewModelScope.launch {
-            llm.send(text)
+            llamaAndroid.send(text)
                .catch {
                    Log.e(tag, "send() failed", it)
                    messages += it.message!!
@@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
        viewModelScope.launch {
            try {
                val start = System.nanoTime()
-                val warmupResult = llm.bench(pp, tg, pl, nr)
+                val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
                val end = System.nanoTime()
 
                messages += warmupResult
@@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
                    return@launch
                }
 
-                messages += llm.bench(512, 128, 1, 3)
+                messages += llamaAndroid.bench(512, 128, 1, 3)
            } catch (exc: IllegalStateException) {
                Log.e(tag, "bench() failed", exc)
                messages += exc.message!!
@@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
    fun load(pathToModel: String) {
        viewModelScope.launch {
            try {
-                llm.load(pathToModel)
+                llamaAndroid.load(pathToModel)
                messages += "Loaded $pathToModel"
            } catch (exc: IllegalStateException) {
                Log.e(tag, "load() failed", exc)

@@ -2,4 +2,5 @@
 plugins {
     id("com.android.application") version "8.2.0" apply false
     id("org.jetbrains.kotlin.android") version "1.9.0" apply false
+    id("com.android.library") version "8.2.0" apply false
 }

examples/llama.android/llama/.gitignore (new file, vendored, 1 line)

@@ -0,0 +1 @@
+/build

@@ -42,7 +42,7 @@ add_subdirectory(../../../../../../ build-llama)
 # used in the AndroidManifest.xml file.
 add_library(${CMAKE_PROJECT_NAME} SHARED
         # List C/C++ source files with relative paths to this CMakeLists.txt.
         llama-android.cpp)
 
 # Specifies libraries CMake should link to your target library. You
 # can link libraries from various origins, such as libraries defined in this

examples/llama.android/llama/build.gradle.kts (new file, 68 lines)

@@ -0,0 +1,68 @@
+plugins {
+    id("com.android.library")
+    id("org.jetbrains.kotlin.android")
+}
+
+android {
+    namespace = "android.llama.cpp"
+    compileSdk = 34
+
+    defaultConfig {
+        minSdk = 33
+
+        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+        consumerProguardFiles("consumer-rules.pro")
+        ndk {
+            // Add NDK properties if wanted, e.g.
+            // abiFilters += listOf("arm64-v8a")
+        }
+        externalNativeBuild {
+            cmake {
+                arguments += "-DCMAKE_BUILD_TYPE=Release"
+                cppFlags += listOf()
+                arguments += listOf()
+
+                cppFlags("")
+            }
+        }
+    }
+
+    buildTypes {
+        release {
+            isMinifyEnabled = false
+            proguardFiles(
+                getDefaultProguardFile("proguard-android-optimize.txt"),
+                "proguard-rules.pro"
+            )
+        }
+    }
+    externalNativeBuild {
+        cmake {
+            path("src/main/cpp/CMakeLists.txt")
+            version = "3.22.1"
+        }
+    }
+    compileOptions {
+        sourceCompatibility = JavaVersion.VERSION_1_8
+        targetCompatibility = JavaVersion.VERSION_1_8
+    }
+    kotlinOptions {
+        jvmTarget = "1.8"
+    }
+
+    packaging {
+        resources {
+            excludes += "/META-INF/{AL2.0,LGPL2.1}"
+        }
+    }
+}
+
+dependencies {
+
+    implementation("androidx.core:core-ktx:1.12.0")
+    implementation("androidx.appcompat:appcompat:1.6.1")
+    implementation("com.google.android.material:material:1.11.0")
+    testImplementation("junit:junit:4.13.2")
+    androidTestImplementation("androidx.test.ext:junit:1.1.5")
+    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+}

examples/llama.android/llama/consumer-rules.pro (new file, empty)

examples/llama.android/llama/proguard-rules.pro (new file, vendored, 21 lines)

@@ -0,0 +1,21 @@
+# Add project specific ProGuard rules here.
+# You can control the set of applied configuration files using the
+# proguardFiles setting in build.gradle.
+#
+# For more details, see
+#   http://developer.android.com/guide/developing/tools/proguard.html
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+#   public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile

@@ -0,0 +1,24 @@
+package android.llama.cpp
+
+import androidx.test.platform.app.InstrumentationRegistry
+import androidx.test.ext.junit.runners.AndroidJUnit4
+
+import org.junit.Test
+import org.junit.runner.RunWith
+
+import org.junit.Assert.*
+
+/**
+ * Instrumented test, which will execute on an Android device.
+ *
+ * See [testing documentation](http://d.android.com/tools/testing).
+ */
+@RunWith(AndroidJUnit4::class)
+class ExampleInstrumentedTest {
+    @Test
+    fun useAppContext() {
+        // Context of the app under test.
+        val appContext = InstrumentationRegistry.getInstrumentation().targetContext
+        assertEquals("android.llama.cpp.test", appContext.packageName)
+    }
+}

@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">
+
+</manifest>

examples/llama.android/llama/src/main/cpp/CMakeLists.txt (new file, 49 lines)

@@ -0,0 +1,49 @@
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+include(FetchContent)
+FetchContent_Declare(
+        llama
+        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+        GIT_TAG        master
+)
+
+# Also provides "common"
+FetchContent_MakeAvailable(llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+        # List C/C++ source files with relative paths to this CMakeLists.txt.
+        llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+        # List libraries link to the target library
+        llama
+        common
+        android
+        log)

@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
 
 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
+Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
     llama_model_params model_params = llama_model_default_params();
 
     auto path_to_model = env->GetStringUTFChars(filename, 0);
@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
+Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
     llama_free_model(reinterpret_cast<llama_model *>(model));
 }
 
 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
     auto model = reinterpret_cast<llama_model *>(jmodel);
 
     if (!model) {
@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
     llama_free(reinterpret_cast<llama_context *>(context));
 }
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
     llama_backend_free();
 }
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
     llama_log_set(log_callback, NULL);
 }
 
 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_bench_1model(
+Java_android_llama_cpp_LLamaAndroid_bench_1model(
         JNIEnv *env,
         jobject,
         jlong context_pointer,
@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
     llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
 }
 
 extern "C"
 JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
 
     // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
 
@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
     llama_backend_init();
 }
 
 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
+Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
     return env->NewStringUTF(llama_print_system_info());
 }
 
 extern "C"
 JNIEXPORT jint JNICALL
-Java_com_example_llama_Llm_completion_1init(
+Java_android_llama_cpp_LLamaAndroid_completion_1init(
         JNIEnv *env,
         jobject,
         jlong context_pointer,
@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
 
 extern "C"
 JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_completion_1loop(
+Java_android_llama_cpp_LLamaAndroid_completion_1loop(
         JNIEnv * env,
         jobject,
         jlong context_pointer,
@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
 
 extern "C"
 JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
     llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
 }

@@ -1,4 +1,4 @@
-package com.example.llama
+package android.llama.cpp
 
 import android.util.Log
 import kotlinx.coroutines.CoroutineDispatcher
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
 import java.util.concurrent.Executors
 import kotlin.concurrent.thread
 
-class Llm {
+class LLamaAndroid {
     private val tag: String? = this::class.simpleName
 
     private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@@ -165,8 +165,8 @@ class Llm {
         }
 
         // Enforce only one instance of Llm.
-        private val _instance: Llm = Llm()
+        private val _instance: LLamaAndroid = LLamaAndroid()
 
-        fun instance(): Llm = _instance
+        fun instance(): LLamaAndroid = _instance
     }
 }

@@ -0,0 +1,17 @@
+package android.llama.cpp
+
+import org.junit.Test
+
+import org.junit.Assert.*
+
+/**
+ * Example local unit test, which will execute on the development machine (host).
+ *
+ * See [testing documentation](http://d.android.com/tools/testing).
+ */
+class ExampleUnitTest {
+    @Test
+    fun addition_isCorrect() {
+        assertEquals(4, 2 + 2)
+    }
+}

@@ -15,3 +15,4 @@ dependencyResolutionManagement {
 
 rootProject.name = "LlamaAndroid"
 include(":app")
+include(":llama")

@@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
         // display text
         if (input_echo && display) {
             for (auto id : embd) {
-                const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
-                printf("%s", token_str.c_str());
+                const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+
+                // Console/Stream Output
+                fprintf(stdout, "%s", token_str.c_str());
+
+                // Record Displayed Tokens To Log
+                // Note: Generated tokens are created one by one hence this check
                 if (embd.size() > 1) {
+                    // Incoming Requested Tokens
                     input_tokens.push_back(id);
                 } else {
+                    // Outgoing Generated Tokens
                     output_tokens.push_back(id);
                     output_ss << token_str;
                 }
+
+                fflush(stdout);
             }
-            fflush(stdout);
         }
 
         // reset color to default if there is no pending user input
         if (input_echo && (int) embd_inp.size() == n_consumed) {
             console::set_display(console::reset);

@@ -1,7 +1,7 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
-    <title>SimpleChat (LlamaCPP, ...) </title>
+    <title>SimpleChat LlamaCppEtal </title>
     <meta charset="UTF-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1" />
     <meta name="message" content="Save Nature Save Earth" />
@@ -30,20 +30,17 @@
         <hr>
         <div class="sameline">
             <label for="system-in">System</label>
-            <input type="text" name="system" id="system-in" class="flex-grow"/>
+            <input type="text" name="system" id="system-in" placeholder="e.g. you are a helpful ai assistant, who provides concise answers" class="flex-grow"/>
         </div>
 
         <hr>
         <div id="chat-div">
-            <p> Enter the system prompt above, before entering/submitting any user query.</p>
-            <p> Enter your text to the ai assistant below.</p>
-            <p> Use shift+enter for inserting enter.</p>
-            <p> Refresh the page to start over fresh.</p>
+            <p> You need to have javascript enabled.</p>
         </div>
 
         <hr>
         <div class="sameline">
-            <textarea id="user-in" class="flex-grow" rows="3"></textarea>
+            <textarea id="user-in" class="flex-grow" rows="3" placeholder="enter your query to the ai model here" ></textarea>
             <button id="user-btn">submit</button>
         </div>
 
@ -14,11 +14,15 @@ own system prompts.
|
||||||
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
|
The UI follows a responsive web design so that the layout can adapt to available display space in a usable
|
||||||
enough manner, in general.
|
enough manner, in general.
|
||||||
|
|
||||||
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
|
Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
|
||||||
culling of old messages from the chat.
|
console.
|
||||||
|
|
||||||
NOTE: It doesnt set any parameters other than temperature for now. However if someone wants they can update
|
NOTE: Given that the idea is for basic minimal testing, it doesnt bother with any model context length and
|
||||||
the js file as needed.
|
culling of old messages from the chat by default. However by enabling the sliding window chat logic, a crude
|
||||||
|
form of old messages culling can be achieved.
|
||||||
|
|
||||||
|
NOTE: It doesnt set any parameters other than temperature and max_tokens for now. However if someone wants
|
||||||
|
they can update the js file or equivalent member in gMe as needed.
|
||||||
|
|
||||||
|
|
||||||
## usage
|
## usage
|
||||||
|
@ -43,11 +47,33 @@ next run this web front end in examples/server/public_simplechat
|
||||||
### using the front end
|
### using the front end
|
||||||
|
|
||||||
Open this simple web front end from your local browser
|
Open this simple web front end from your local browser
|
||||||
|
|
||||||
* http://127.0.0.1:PORT/index.html
|
* http://127.0.0.1:PORT/index.html
|
||||||
|
|
||||||
Once inside
|
Once inside
|
||||||
|
|
||||||
* Select between chat and completion mode. By default it is set to chat mode.
|
* Select between chat and completion mode. By default it is set to chat mode.
|
||||||
|
|
||||||
|
* In completion mode
|
||||||
|
* logic by default doesnt insert any role specific "ROLE: " prefix wrt each role's message.
|
||||||
|
If the model requires any prefix wrt user role messages, then the end user has to
|
||||||
|
explicitly add the needed prefix, when they enter their chat message.
|
||||||
|
Similarly if the model requires any prefix to trigger assistant/ai-model response,
|
||||||
|
then the end user needs to enter the same.
|
||||||
|
This keeps the logic simple, while still giving flexibility to the end user to
|
||||||
|
manage any templating/tagging requirement wrt their messages to the model.
|
||||||
|
* the logic doesnt insert newline at the begining and end wrt the prompt message generated.
|
||||||
|
However if the chat being sent to /completions end point has more than one role's message,
|
||||||
|
then insert newline when moving from one role's message to the next role's message, so
|
||||||
|
that it can be clearly identified/distinguished.
|
||||||
|
* given that /completions endpoint normally doesnt add additional chat-templating of its
|
||||||
|
own, the above ensures that end user can create a custom single/multi message combo with
|
||||||
|
any tags/special-tokens related chat templating to test out model handshake. Or enduser
|
||||||
|
can use it just for normal completion related/based query.
|
||||||
|
|
||||||
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
|
* If you want to provide a system prompt, then ideally enter it first, before entering any user query.
|
||||||
|
Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
|
||||||
|
responses with a suitable system prompt.
|
||||||
* if chat.add_system_begin is used
|
* if chat.add_system_begin is used
|
||||||
* you cant change the system prompt, after it is has been submitted once along with user query.
|
* you cant change the system prompt, after it is has been submitted once along with user query.
|
||||||
* you cant set a system prompt, after you have submitted any user query
|
* you cant set a system prompt, after you have submitted any user query
|
||||||
|
@ -55,27 +81,121 @@ Once inside
|
||||||
* one can change the system prompt any time during chat, by changing the contents of system prompt.
|
* one can change the system prompt any time during chat, by changing the contents of system prompt.
|
||||||
* inturn the updated/changed system prompt will be inserted into the chat session.
|
* inturn the updated/changed system prompt will be inserted into the chat session.
|
||||||
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
|
* this allows for the subsequent user chatting to be driven by the new system prompt set above.
|
||||||
|
|
||||||
* Enter your query and either press enter or click on the submit button.
|
* Enter your query and either press enter or click on the submit button.
|
||||||
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
|
If you want to insert enter (\n) as part of your chat/query to ai model, use shift+enter.
|
||||||
|
|
||||||
* Wait for the logic to communicate with the server and get the response.
|
* Wait for the logic to communicate with the server and get the response.
|
||||||
* the user is not allowed to enter any fresh query during this time.
|
* the user is not allowed to enter any fresh query during this time.
|
||||||
* the user input box will be disabled and a working message will be shown in it.
|
* the user input box will be disabled and a working message will be shown in it.
|
||||||
|
|
||||||
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
|
* just refresh the page, to reset wrt the chat history and or system prompt and start afresh.
|
||||||
|
|
||||||
* Using NewChat one can start independent chat sessions.
|
* Using NewChat one can start independent chat sessions.
|
||||||
* two independent chat sessions are setup by default.
|
* two independent chat sessions are setup by default.
|
||||||
|
|
||||||
|
|
||||||
## Devel note
|
## Devel note
|
||||||
|
|
||||||
|
### Reason behind this
|
||||||
|
|
||||||
|
The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
|
||||||
|
by developers who may not be from web frontend background (so inturn may not be familiar with template /
|
||||||
|
end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
|
||||||
|
|
||||||
|
And given that the idea is also to help explore/experiment for developers, some flexibility is provided
|
||||||
|
to change behaviour easily using the devel-tools/console, for now. And skeletal logic has been implemented
|
||||||
|
to explore some of the end points and ideas/implications around them.
|
||||||
|
|
||||||
|
|
||||||
|
### General
|
||||||
|
|
||||||
|
Me/gMe consolidates the settings which control the behaviour into one object.
|
||||||
|
One can see the current settings, as well as change/update them using browsers devel-tool/console.
|
||||||
|
|
||||||
|
bCompletionFreshChatAlways - whether Completion mode collates complete/sliding-window history when
|
||||||
|
communicating with the server or only sends the latest user query/message.
|
||||||
|
|
||||||
|
bCompletionInsertStandardRolePrefix - whether Completion mode inserts role related prefix wrt the
|
||||||
|
messages that get inserted into prompt field wrt /Completion endpoint.
|
||||||
|
|
||||||
|
chatRequestOptions - maintains the list of options/fields to send along with chat request,
|
||||||
|
irrespective of whether /chat/completions or /completions endpoint.
|
||||||
|
|
||||||
|
If you want to add additional options/fields to send to the server/ai-model, and or
|
||||||
|
modify the existing options value or remove them, for now you can update this global var
|
||||||
|
using browser's development-tools/console.
|
||||||
|
|
||||||
|
iRecentUserMsgCnt - a simple minded SlidingWindow to limit context window load at Ai Model end.
|
||||||
|
This is disabled by default. However if enabled, then in addition to latest system message, only
|
||||||
|
the last/latest iRecentUserMsgCnt user messages after the latest system prompt and its responses
|
||||||
|
from the ai model will be sent to the ai-model, when querying for a new response. IE if enabled,
|
||||||
|
only user messages after the latest system message/prompt will be considered.
|
||||||
|
|
||||||
|
This specified sliding window user message count also includes the latest user query.
|
||||||
|
<0 : Send entire chat history to server
|
||||||
|
0 : Send only the system message if any to the server
|
||||||
|
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
||||||
|
|
||||||
|
|
||||||
|
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
|
||||||
|
implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||||
|
some extent in a simple crude way.
|
||||||
|
|
||||||
|
|
||||||

Sometimes the browser may be stubborn with caching of the file, so your updates to the html/css/js
may not be visible. Also remember that just refreshing/reloading the page in the browser, or for that
matter clearing site data, doesn't directly override site caching in all cases. Worst case you may
have to change the port. Or, in the dev tools of the browser, you may be able to disable caching fully.

Concept of multiple chat sessions with different servers, as well as saving and restoring of
those across browser usage sessions, can be woven around the SimpleChat/MultiChatUI class and
its instances relatively easily; however, given the current goal of keeping this simple, it has
not been added, for now.

By switching between chat.add_system_begin/anytime, one can control whether the system prompt can be
changed anytime during the conversation, or only at the beginning; see the sketch below.
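
A sketch of this switch, as it would appear in MultiChatUI's user-submit handler (the add_system_begin
call and its argument list are an assumption based on the naming above; only add_system_anytime appears
in the code further below):

```javascript
// Current behaviour: the system prompt box is honoured at any point in the conversation.
chat.add_system_anytime(this.elInSystem.value, chatId);

// Alternative: only honour the system prompt set at the beginning of the conversation.
//chat.add_system_begin(this.elInSystem.value, chatId);
```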

read_json_early is to experiment with reading the json response data early on, if available,
so that the user can be shown the generated data as and when it is being generated, rather than
at the end when the full data is available.

The server flow doesn't seem to be sending back data early, at least for the request (inc options)
that is currently sent.

If able to read json data early on in future, as and when the ai model is generating data, then
this helper needs to indirectly update the chat div with the received data, without waiting
for the overall data to be available.

### Default setup

By default things are setup to try and make the user experience a bit better, if possible.
However a developer, when testing the server or the ai-model, may want to change these values.

Using iRecentUserMsgCnt, reduce the chat history context sent to the server/ai-model to be
just the system-prompt, prev-user-request-and-ai-response and cur-user-request, instead of the
full chat history. This way, if there is any response with garbage/repetition, it doesn't
mess with things beyond the next question/request/query, in some ways.

Set max_tokens to 1024, so that a relatively large previous response doesn't eat up the space
available wrt the next query-response. However don't forget that the server, when started, should
also be started with a model context size of 1k or more, to be on the safe side.

The /completions endpoint of examples/server doesn't take max_tokens; instead it takes the
internal n_predict. For now add the same here on the client side; maybe later add max_tokens
to the /completions endpoint handling code on the server side.
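
A small console sketch of keeping the two limits aligned (the values here are just the shipped defaults):

```javascript
// /chat/completions looks at max_tokens, while examples/server's /completions
// looks at n_predict, so set both to the same value.
gMe.chatRequestOptions["max_tokens"] = 1024;
gMe.chatRequestOptions["n_predict"] = 1024;
```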

The frequency and presence penalty fields are set to 1.2 in the set of fields sent to the server
along with the user query, so that the model is partly set to try to avoid repeating text in
its response.

An end-user can change these behaviours by editing gMe from the browser's devel-tools/console.


## At the end

Also a thank you to all open source and open model developers, who strive for the common good.
|
@ -48,6 +48,13 @@ button {
|
||||||
flex-direction: column;
|
flex-direction: column;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.ul1 {
|
||||||
|
padding-inline-start: 2vw;
|
||||||
|
}
|
||||||
|
.ul2 {
|
||||||
|
padding-inline-start: 2vw;
|
||||||
|
}
|
||||||
|
|
||||||
* {
|
* {
|
||||||
margin: 0.6vmin;
|
margin: 0.6vmin;
|
||||||
}
|
}
|
||||||
|
|
|
@ -14,23 +14,86 @@ class ApiEP {
|
||||||
}
|
}
|
||||||
|
|
||||||
let gUsageMsg = `
|
let gUsageMsg = `
|
||||||
<p> Enter the system prompt above, before entering/submitting any user query.</p>
|
<p class="role-system">Usage</p>
|
||||||
<p> Enter your text to the ai assistant below.</p>
|
<ul class="ul1">
|
||||||
<p> Use shift+enter for inserting enter.</p>
|
<li> Set system prompt above, to try control ai response charactersitic, if model supports same.</li>
|
||||||
<p> Refresh the page to start over fresh.</p>
|
<ul class="ul2">
|
||||||
|
<li> Completion mode normally wont have a system prompt.</li>
|
||||||
|
</ul>
|
||||||
|
<li> Enter your query to ai assistant below.</li>
|
||||||
|
<ul class="ul2">
|
||||||
|
<li> Completion mode doesnt insert user/role: prefix implicitly.</li>
|
||||||
|
<li> Use shift+enter for inserting enter/newline.</li>
|
||||||
|
</ul>
|
||||||
|
<li> Default ContextWindow = [System, Last Query+Resp, Cur Query].</li>
|
||||||
|
<ul class="ul2">
|
||||||
|
<li> experiment iRecentUserMsgCnt, max_tokens, model ctxt window to expand</li>
|
||||||
|
</ul>
|
||||||
|
</ul>
|
||||||
`;
|
`;
|
||||||
|
|
||||||
|
/** @typedef {{role: string, content: string}[]} ChatMessages */
|
||||||
|
|
||||||
class SimpleChat {
|
class SimpleChat {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
/**
|
/**
|
||||||
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
* Maintain in a form suitable for common LLM web service chat/completions' messages entry
|
||||||
* @type {{role: string, content: string}[]}
|
* @type {ChatMessages}
|
||||||
*/
|
*/
|
||||||
this.xchat = [];
|
this.xchat = [];
|
||||||
this.iLastSys = -1;
|
this.iLastSys = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
clear() {
|
||||||
|
this.xchat = [];
|
||||||
|
this.iLastSys = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Recent chat messages.
|
||||||
|
* If iRecentUserMsgCnt < 0
|
||||||
|
* Then return the full chat history
|
||||||
|
* Else
|
||||||
|
* Return chat messages from latest going back till the last/latest system prompt.
|
||||||
|
* While keeping track that the number of user queries/messages doesnt exceed iRecentUserMsgCnt.
|
||||||
|
* @param {number} iRecentUserMsgCnt
|
||||||
|
*/
|
||||||
|
recent_chat(iRecentUserMsgCnt) {
|
||||||
|
if (iRecentUserMsgCnt < 0) {
|
||||||
|
return this.xchat;
|
||||||
|
}
|
||||||
|
if (iRecentUserMsgCnt == 0) {
|
||||||
|
console.warn("WARN:SimpleChat:SC:RecentChat:iRecentUsermsgCnt of 0 means no user message/query sent");
|
||||||
|
}
|
||||||
|
/** @type{ChatMessages} */
|
||||||
|
let rchat = [];
|
||||||
|
let sysMsg = this.get_system_latest();
|
||||||
|
if (sysMsg.length != 0) {
|
||||||
|
rchat.push({role: Roles.System, content: sysMsg});
|
||||||
|
}
|
||||||
|
let iUserCnt = 0;
|
||||||
|
let iStart = this.xchat.length;
|
||||||
|
for(let i=this.xchat.length-1; i > this.iLastSys; i--) {
|
||||||
|
if (iUserCnt >= iRecentUserMsgCnt) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let msg = this.xchat[i];
|
||||||
|
if (msg.role == Roles.User) {
|
||||||
|
iStart = i;
|
||||||
|
iUserCnt += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for(let i = iStart; i < this.xchat.length; i++) {
|
||||||
|
let msg = this.xchat[i];
|
||||||
|
if (msg.role == Roles.System) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
rchat.push({role: msg.role, content: msg.content});
|
||||||
|
}
|
||||||
|
return rchat;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add an entry into xchat
|
* Add an entry into xchat
|
||||||
* @param {string} role
|
* @param {string} role
|
||||||
|
@ -57,7 +120,7 @@ class SimpleChat {
|
||||||
div.replaceChildren();
|
div.replaceChildren();
|
||||||
}
|
}
|
||||||
let last = undefined;
|
let last = undefined;
|
||||||
for(const x of this.xchat) {
|
for(const x of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||||
let entry = document.createElement("p");
|
let entry = document.createElement("p");
|
||||||
entry.className = `role-${x.role}`;
|
entry.className = `role-${x.role}`;
|
||||||
entry.innerText = `${x.role}: ${x.content}`;
|
entry.innerText = `${x.role}: ${x.content}`;
|
||||||
|
@ -69,17 +132,21 @@ class SimpleChat {
|
||||||
} else {
|
} else {
|
||||||
if (bClear) {
|
if (bClear) {
|
||||||
div.innerHTML = gUsageMsg;
|
div.innerHTML = gUsageMsg;
|
||||||
|
gMe.show_info(div);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint
|
* Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||||
|
* The needed fields/options are picked from a global object.
|
||||||
* Convert the json into string.
|
* Convert the json into string.
|
||||||
* @param {Object} obj
|
* @param {Object} obj
|
||||||
*/
|
*/
|
||||||
request_jsonstr(obj) {
|
request_jsonstr(obj) {
|
||||||
obj["temperature"] = 0.7;
|
for(let k in gMe.chatRequestOptions) {
|
||||||
|
obj[k] = gMe.chatRequestOptions[k];
|
||||||
|
}
|
||||||
return JSON.stringify(obj);
|
return JSON.stringify(obj);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -88,18 +155,27 @@ class SimpleChat {
|
||||||
*/
|
*/
|
||||||
request_messages_jsonstr() {
|
request_messages_jsonstr() {
|
||||||
let req = {
|
let req = {
|
||||||
messages: this.xchat,
|
messages: this.recent_chat(gMe.iRecentUserMsgCnt),
|
||||||
}
|
}
|
||||||
return this.request_jsonstr(req);
|
return this.request_jsonstr(req);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return a string form of json object suitable for /completions
|
* Return a string form of json object suitable for /completions
|
||||||
|
* @param {boolean} bInsertStandardRolePrefix Insert "<THE_ROLE>: " as prefix wrt each role's message
|
||||||
*/
|
*/
|
||||||
request_prompt_jsonstr() {
|
request_prompt_jsonstr(bInsertStandardRolePrefix) {
|
||||||
let prompt = "";
|
let prompt = "";
|
||||||
for(const chat of this.xchat) {
|
let iCnt = 0;
|
||||||
prompt += `${chat.role}: ${chat.content}\n`;
|
for(const chat of this.recent_chat(gMe.iRecentUserMsgCnt)) {
|
||||||
|
iCnt += 1;
|
||||||
|
if (iCnt > 1) {
|
||||||
|
prompt += "\n";
|
||||||
|
}
|
||||||
|
if (bInsertStandardRolePrefix) {
|
||||||
|
prompt += `${chat.role}: `;
|
||||||
|
}
|
||||||
|
prompt += `${chat.content}`;
|
||||||
}
|
}
|
||||||
let req = {
|
let req = {
|
||||||
prompt: prompt,
|
prompt: prompt,
|
||||||
|
@ -171,7 +247,6 @@ let gChatURL = {
|
||||||
'chat': `${gBaseURL}/chat/completions`,
|
'chat': `${gBaseURL}/chat/completions`,
|
||||||
'completion': `${gBaseURL}/completions`,
|
'completion': `${gBaseURL}/completions`,
|
||||||
}
|
}
|
||||||
const gbCompletionFreshChatAlways = true;
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -291,6 +366,8 @@ class MultiChatUI {
|
||||||
// allow user to insert enter into their message using shift+enter.
|
// allow user to insert enter into their message using shift+enter.
|
||||||
// while just pressing enter key will lead to submitting.
|
// while just pressing enter key will lead to submitting.
|
||||||
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
if ((ev.key === "Enter") && (!ev.shiftKey)) {
|
||||||
|
let value = this.elInUser.value;
|
||||||
|
this.elInUser.value = value.substring(0,value.length-1);
|
||||||
this.elBtnUser.click();
|
this.elBtnUser.click();
|
||||||
ev.preventDefault();
|
ev.preventDefault();
|
||||||
}
|
}
|
||||||
|
@ -321,6 +398,29 @@ class MultiChatUI {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Try read json response early, if available.
|
||||||
|
* @param {Response} resp
|
||||||
|
*/
|
||||||
|
async read_json_early(resp) {
|
||||||
|
if (!resp.body) {
|
||||||
|
throw Error("ERRR:SimpleChat:MCUI:ReadJsonEarly:No body...");
|
||||||
|
}
|
||||||
|
let tdUtf8 = new TextDecoder("utf-8");
|
||||||
|
let rr = resp.body.getReader();
|
||||||
|
let gotBody = "";
|
||||||
|
while(true) {
|
||||||
|
let { value: cur, done: done} = await rr.read();
|
||||||
|
let curBody = tdUtf8.decode(cur);
|
||||||
|
console.debug("DBUG:SC:PART:", curBody);
|
||||||
|
gotBody += curBody;
|
||||||
|
if (done) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return JSON.parse(gotBody);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Handle user query submit request, wrt specified chat session.
|
* Handle user query submit request, wrt specified chat session.
|
||||||
* @param {string} chatId
|
* @param {string} chatId
|
||||||
|
@ -330,6 +430,14 @@ class MultiChatUI {
|
||||||
|
|
||||||
let chat = this.simpleChats[chatId];
|
let chat = this.simpleChats[chatId];
|
||||||
|
|
||||||
|
// In completion mode, if configured, clear any previous chat history.
|
||||||
|
// So if user wants to simulate a multi-chat based completion query,
|
||||||
|
// they will have to enter the full thing, as a suitable multiline
|
||||||
|
// user input/query.
|
||||||
|
if ((apiEP == ApiEP.Completion) && (gMe.bCompletionFreshChatAlways)) {
|
||||||
|
chat.clear();
|
||||||
|
}
|
||||||
|
|
||||||
chat.add_system_anytime(this.elInSystem.value, chatId);
|
chat.add_system_anytime(this.elInSystem.value, chatId);
|
||||||
|
|
||||||
let content = this.elInUser.value;
|
let content = this.elInUser.value;
|
||||||
|
@ -344,7 +452,7 @@ class MultiChatUI {
|
||||||
if (apiEP == ApiEP.Chat) {
|
if (apiEP == ApiEP.Chat) {
|
||||||
theBody = chat.request_messages_jsonstr();
|
theBody = chat.request_messages_jsonstr();
|
||||||
} else {
|
} else {
|
||||||
theBody = chat.request_prompt_jsonstr();
|
theBody = chat.request_prompt_jsonstr(gMe.bCompletionInsertStandardRolePrefix);
|
||||||
}
|
}
|
||||||
|
|
||||||
this.elInUser.value = "working...";
|
this.elInUser.value = "working...";
|
||||||
|
@ -359,6 +467,7 @@ class MultiChatUI {
|
||||||
});
|
});
|
||||||
|
|
||||||
let respBody = await resp.json();
|
let respBody = await resp.json();
|
||||||
|
//let respBody = await this.read_json_early(resp);
|
||||||
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
console.debug(`DBUG:SimpleChat:MCUI:${chatId}:HandleUserSubmit:RespBody:${JSON.stringify(respBody)}`);
|
||||||
let assistantMsg;
|
let assistantMsg;
|
||||||
if (apiEP == ApiEP.Chat) {
|
if (apiEP == ApiEP.Chat) {
|
||||||
|
@ -376,13 +485,6 @@ class MultiChatUI {
|
||||||
} else {
|
} else {
|
||||||
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
console.debug(`DBUG:SimpleChat:MCUI:HandleUserSubmit:ChatId has changed:[${chatId}] [${this.curChatId}]`);
|
||||||
}
|
}
|
||||||
// Purposefully clear at end rather than begin of this function
|
|
||||||
// so that one can switch from chat to completion mode and sequece
|
|
||||||
// in a completion mode with multiple user-assistant chat data
|
|
||||||
// from before to be sent/occur once.
|
|
||||||
if ((apiEP == ApiEP.Completion) && (gbCompletionFreshChatAlways)) {
|
|
||||||
chat.xchat.length = 0;
|
|
||||||
}
|
|
||||||
this.ui_reset_userinput();
|
this.ui_reset_userinput();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -462,17 +564,66 @@ class MultiChatUI {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
let gMuitChat;
|
class Me {
|
||||||
const gChatIds = [ "Default", "Other" ];
|
|
||||||
|
constructor() {
|
||||||
|
this.defaultChatIds = [ "Default", "Other" ];
|
||||||
|
this.multiChat = new MultiChatUI();
|
||||||
|
this.bCompletionFreshChatAlways = true;
|
||||||
|
this.bCompletionInsertStandardRolePrefix = false;
|
||||||
|
this.iRecentUserMsgCnt = 2;
|
||||||
|
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||||
|
this.chatRequestOptions = {
|
||||||
|
"temperature": 0.7,
|
||||||
|
"max_tokens": 1024,
|
||||||
|
"frequency_penalty": 1.2,
|
||||||
|
"presence_penalty": 1.2,
|
||||||
|
"n_predict": 1024
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param {HTMLDivElement} elDiv
|
||||||
|
*/
|
||||||
|
show_info(elDiv) {
|
||||||
|
|
||||||
|
var p = document.createElement("p");
|
||||||
|
p.innerText = "Settings (devel-tools-console gMe)";
|
||||||
|
p.className = "role-system";
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
var p = document.createElement("p");
|
||||||
|
p.innerText = `bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`;
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
p = document.createElement("p");
|
||||||
|
p.innerText = `bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`;
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
p = document.createElement("p");
|
||||||
|
p.innerText = `iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`;
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
p = document.createElement("p");
|
||||||
|
p.innerText = `chatRequestOptions:${JSON.stringify(this.chatRequestOptions)}`;
|
||||||
|
elDiv.appendChild(p);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/** @type {Me} */
|
||||||
|
let gMe;
|
||||||
|
|
||||||
function startme() {
|
function startme() {
|
||||||
console.log("INFO:SimpleChat:StartMe:Starting...");
|
console.log("INFO:SimpleChat:StartMe:Starting...");
|
||||||
gMuitChat = new MultiChatUI();
|
gMe = new Me();
|
||||||
for (let cid of gChatIds) {
|
for (let cid of gMe.defaultChatIds) {
|
||||||
gMuitChat.new_chat_session(cid);
|
gMe.multiChat.new_chat_session(cid);
|
||||||
}
|
}
|
||||||
gMuitChat.setup_ui(gChatIds[0]);
|
gMe.multiChat.setup_ui(gMe.defaultChatIds[0], true);
|
||||||
gMuitChat.show_sessions();
|
gMe.multiChat.show_sessions();
|
||||||
}
|
}
|
||||||
|
|
||||||
document.addEventListener("DOMContentLoaded", startme);
|
document.addEventListener("DOMContentLoaded", startme);
|
||||||
|
|
|
@ -13,10 +13,10 @@ if %errorlevel% neq 0 goto ERROR
|
||||||
|
|
||||||
:: for FP16
|
:: for FP16
|
||||||
:: faster for long-prompt inference
|
:: faster for long-prompt inference
|
||||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
||||||
|
|
||||||
:: for FP32
|
:: for FP32
|
||||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
:: build example/main only
|
:: build example/main only
|
||||||
:: make main
|
:: make main
|
||||||
|
|
|
@ -3,40 +3,390 @@
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
#if defined(_WIN32)
|
||||||
if (argc < 3 || argv[1][0] == '-') {
|
#define WIN32_LEAN_AND_MEAN
|
||||||
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
|
#include <windows.h>
|
||||||
|
#include <shellapi.h> // For CommandLineToArgvW
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static void print_usage_information(const char * argv0, FILE * stream) {
|
||||||
|
fprintf(stream, "usage: %s [options]\n\n", argv0);
|
||||||
|
fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
|
||||||
|
fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
|
||||||
|
fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
|
||||||
|
fprintf(stream, "to control the behavior of the tokenizer.\n\n");
|
||||||
|
fprintf(stream, " The possible options are:\n");
|
||||||
|
fprintf(stream, "\n");
|
||||||
|
fprintf(stream, " -h, --help print this help and exit\n");
|
||||||
|
fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n");
|
||||||
|
fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n");
|
||||||
|
fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n");
|
||||||
|
fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
|
||||||
|
fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n");
|
||||||
|
fprintf(stream, " --stdin read prompt from standard input.\n");
|
||||||
|
fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
|
||||||
|
fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) {
|
||||||
|
(void) level;
|
||||||
|
(void) text;
|
||||||
|
(void) user_data;
|
||||||
|
}
|
||||||
|
|
||||||
|
static std::string read_prompt_from_file(const char * filepath, bool & success) {
|
||||||
|
success = false;
|
||||||
|
|
||||||
|
std::ifstream in(filepath, std::ios::binary);
|
||||||
|
if (!in) {
|
||||||
|
fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
|
||||||
|
return std::string();
|
||||||
|
}
|
||||||
|
// do not assume the file is seekable (e.g. /dev/stdin)
|
||||||
|
std::stringstream buffer;
|
||||||
|
buffer << in.rdbuf();
|
||||||
|
if (in.fail()) {
|
||||||
|
fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
|
||||||
|
return std::string();
|
||||||
|
}
|
||||||
|
|
||||||
|
success = true;
|
||||||
|
return buffer.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Function: ingest_args(...) -> vector<string>
|
||||||
|
//
|
||||||
|
// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded
|
||||||
|
// strings, as an STL vector<string>.
|
||||||
|
//
|
||||||
|
// In particular, it handles character encoding shenanigans on Windows.
|
||||||
|
//
|
||||||
|
// Note: raw_argc and raw_argv are not actually read at all on Windows.
|
||||||
|
// On Windows we call GetCommandLineW to get the arguments in wchar_t
|
||||||
|
// format, ignoring the regular argc/argv arguments to main().
|
||||||
|
//
|
||||||
|
// TODO: potential opportunity to roll common stuff into common/console.cpp
|
||||||
|
// in relation to Windows wchar_t shenanigans.
|
||||||
|
static std::vector<std::string> ingest_args(int raw_argc, char ** raw_argv) {
|
||||||
|
std::vector<std::string> argv;
|
||||||
|
|
||||||
|
// Handle Windows, if given non-ASCII arguments.
|
||||||
|
// We convert wchar_t arguments into UTF-8 char* on this platform.
|
||||||
|
// Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters
|
||||||
|
// without throwing tantrums.
|
||||||
|
#if defined(_WIN32)
|
||||||
|
int argc;
|
||||||
|
const LPWSTR cmdline_wargv = GetCommandLineW();
|
||||||
|
LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc);
|
||||||
|
|
||||||
|
// silence unused arg warnings
|
||||||
|
(void) raw_argc;
|
||||||
|
(void) raw_argv;
|
||||||
|
|
||||||
|
for (int i = 0; i < argc; ++i) {
|
||||||
|
int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL);
|
||||||
|
char * output_buf = (char *) calloc(length_needed+1, sizeof(char));
|
||||||
|
GGML_ASSERT(output_buf);
|
||||||
|
|
||||||
|
WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL);
|
||||||
|
output_buf[length_needed] = '\0';
|
||||||
|
|
||||||
|
argv.push_back(output_buf);
|
||||||
|
free(output_buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
LocalFree((HLOCAL) wargv);
|
||||||
|
#else
|
||||||
|
int argc = raw_argc;
|
||||||
|
for (int i = 0; i < argc; ++i) {
|
||||||
|
argv.push_back(raw_argv[i]);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
GGML_ASSERT((unsigned int) argc == argv.size());
|
||||||
|
|
||||||
|
return argv;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Function: write_utf8_cstr_to_stdout(const char *) -> <writes to stdout>
|
||||||
|
//
|
||||||
|
// writes a string to standard output; taking into account that on Windows
|
||||||
|
// to display correctly you have to use special handling. Works even if the
|
||||||
|
// user has not set a unicode code page on a Windows cmd.exe.
|
||||||
|
//
|
||||||
|
// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something
|
||||||
|
// a human-readable is written instead.
|
||||||
|
//
|
||||||
|
// On non-Windows systems, simply printfs() the string.
|
||||||
|
static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
|
||||||
|
invalid_utf8 = false;
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
// Are we in a console?
|
||||||
|
HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
|
||||||
|
DWORD dwMode = 0;
|
||||||
|
|
||||||
|
// According to Microsoft docs:
|
||||||
|
// "WriteConsole fails if it is used with a standard handle that is redirected to a file."
|
||||||
|
// Also according to the docs, you can use GetConsoleMode to check for that.
|
||||||
|
if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
|
||||||
|
printf("%s", str);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// MultiByteToWideChar reports an error if str is empty, don't report
|
||||||
|
// them as invalid_utf8.
|
||||||
|
if (*str == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0);
|
||||||
|
if (length_needed == 0) {
|
||||||
|
DWORD err = GetLastError();
|
||||||
|
if (err == ERROR_NO_UNICODE_TRANSLATION) {
|
||||||
|
invalid_utf8 = true;
|
||||||
|
int len = strlen(str);
|
||||||
|
printf("<");
|
||||||
|
for (int i = 0; i < len; ++i) {
|
||||||
|
if (i > 0) {
|
||||||
|
printf(" ");
|
||||||
|
}
|
||||||
|
printf("%02x", (uint8_t) str[i]);
|
||||||
|
}
|
||||||
|
printf(">");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
|
||||||
|
}
|
||||||
|
|
||||||
|
LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));
|
||||||
|
GGML_ASSERT(wstr);
|
||||||
|
|
||||||
|
MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed);
|
||||||
|
WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL);
|
||||||
|
|
||||||
|
free(wstr);
|
||||||
|
#else
|
||||||
|
// TODO: reporting invalid_utf8 would be useful on non-Windows too.
|
||||||
|
// printf will silently just write bad unicode.
|
||||||
|
printf("%s", str);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int raw_argc, char ** raw_argv) {
|
||||||
|
const std::vector<std::string> argv = ingest_args(raw_argc, raw_argv);
|
||||||
|
const int argc = argv.size();
|
||||||
|
|
||||||
|
if (argc <= 1) {
|
||||||
|
print_usage_information(argv[0].c_str(), stderr);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * model_path = argv[1];
|
//////
|
||||||
const char * prompt = argv[2];
|
// Read out all the command line arguments.
|
||||||
|
//////
|
||||||
|
|
||||||
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
|
// variables where to put any arguments we see.
|
||||||
|
bool printing_ids = false;
|
||||||
|
bool no_bos = false;
|
||||||
|
bool disable_logging = false;
|
||||||
|
const char * model_path = NULL;
|
||||||
|
const char * prompt_path = NULL;
|
||||||
|
const char * prompt_arg = NULL;
|
||||||
|
|
||||||
|
// track which arguments were explicitly given
|
||||||
|
// used for sanity checking down the line
|
||||||
|
bool model_path_set = false;
|
||||||
|
bool prompt_path_set = false;
|
||||||
|
bool prompt_set = false;
|
||||||
|
bool stdin_set = false;
|
||||||
|
|
||||||
|
int iarg = 1;
|
||||||
|
for (; iarg < argc; ++iarg) {
|
||||||
|
std::string arg{argv[iarg]};
|
||||||
|
if (arg == "-h" || arg == "--help") {
|
||||||
|
print_usage_information(argv[0].c_str(), stdout);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
else if (arg == "--ids") {
|
||||||
|
printing_ids = true;
|
||||||
|
}
|
||||||
|
else if (arg == "-m" || arg == "--model") {
|
||||||
|
if (model_path_set) {
|
||||||
|
fprintf(stderr, "Error: -m or --model specified multiple times.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
model_path = argv[++iarg].c_str();
|
||||||
|
model_path_set = true;
|
||||||
|
}
|
||||||
|
else if (arg == "--no-bos") {
|
||||||
|
no_bos = true;
|
||||||
|
}
|
||||||
|
else if (arg == "-p" || arg == "--prompt") {
|
||||||
|
if (prompt_set) {
|
||||||
|
fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
prompt_arg = argv[++iarg].c_str();
|
||||||
|
prompt_set = true;
|
||||||
|
}
|
||||||
|
else if (arg == "-f" || arg == "--file") {
|
||||||
|
if (prompt_path_set) {
|
||||||
|
fprintf(stderr, "Error: -f or --file specified multiple times.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
prompt_path = argv[++iarg].c_str();
|
||||||
|
prompt_path_set = true;
|
||||||
|
}
|
||||||
|
else if (arg == "--stdin") {
|
||||||
|
stdin_set = true;
|
||||||
|
}
|
||||||
|
else if (arg == "--log-disable") {
|
||||||
|
disable_logging = true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//////
|
||||||
|
// Sanity check the command line arguments.
|
||||||
|
//////
|
||||||
|
|
||||||
|
// Check that we have the required stuff set.
|
||||||
|
if (model_path_set && model_path == NULL) {
|
||||||
|
fprintf(stderr, "Error: --model requires an argument.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (!model_path_set) {
|
||||||
|
fprintf(stderr, "Error: must specify --model.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (prompt_path_set && prompt_path == NULL) {
|
||||||
|
fprintf(stderr, "Error: --file requires an argument.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (prompt_set && prompt_arg == NULL) {
|
||||||
|
fprintf(stderr, "Error: --prompt requires an argument.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
|
||||||
|
if (prompts_set > 1) {
|
||||||
|
fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
// Must have some prompt.
|
||||||
|
if (prompts_set == 0) {
|
||||||
|
fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_ASSERT(model_path);
|
||||||
|
GGML_ASSERT(prompt_path || prompt_arg || stdin_set);
|
||||||
|
|
||||||
|
//////
|
||||||
|
// Figure out where will the prompt come from.
|
||||||
|
//////
|
||||||
|
|
||||||
|
std::string prompt;
|
||||||
|
if (prompt_path_set) {
|
||||||
|
bool success = false;
|
||||||
|
prompt = read_prompt_from_file(prompt_path, success);
|
||||||
|
if (!success) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else if (prompt_set) {
|
||||||
|
prompt = prompt_arg;
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(stdin_set);
|
||||||
|
// we read stdin *after* loading model (early exit if model cannot
|
||||||
|
// be loaded, which can be a nicer user experience)
|
||||||
|
}
|
||||||
|
|
||||||
|
//////
|
||||||
|
// Start actually doing the tokenizing stuff.
|
||||||
|
//////
|
||||||
|
|
||||||
|
#ifdef LOG_DISABLE_LOGS
|
||||||
|
disable_logging = true;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (disable_logging) {
|
||||||
|
llama_log_set(llama_log_callback_null, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_default_params();
|
||||||
model_params.vocab_only = true;
|
model_params.vocab_only = true;
|
||||||
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
||||||
|
if (!model) {
|
||||||
|
fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
llama_context_params ctx_params = llama_context_default_params();
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
if (!ctx) {
|
||||||
|
fprintf(stderr, "Error: could not create context.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// read entire prompt from stdin?
|
||||||
|
if (stdin_set) {
|
||||||
|
GGML_ASSERT(!prompt_path_set && !prompt_set);
|
||||||
|
|
||||||
|
std::stringstream stdin_buffer;
|
||||||
|
stdin_buffer << std::cin.rdbuf();
|
||||||
|
if (std::cin.fail()) {
|
||||||
|
fprintf(stderr, "Error: could not read the entire standard input.\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
prompt = stdin_buffer.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool model_wants_add_bos = llama_should_add_bos_token(model);
|
||||||
|
const bool add_bos = model_wants_add_bos && !no_bos;
|
||||||
|
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
|
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||||
|
|
||||||
tokens = ::llama_tokenize(model, prompt, true, true);
|
if (printing_ids) {
|
||||||
|
printf("[");
|
||||||
|
}
|
||||||
|
|
||||||
for (int i = 0; i < (int) tokens.size(); i++) {
|
for (int i = 0; i < (int) tokens.size(); i++) {
|
||||||
if (printing_ids) {
|
if (printing_ids) {
|
||||||
printf("%d\n", tokens[i]);
|
if (i > 0) {
|
||||||
|
printf(", ");
|
||||||
|
}
|
||||||
|
printf("%d", tokens[i]);
|
||||||
} else {
|
} else {
|
||||||
printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
|
bool invalid_utf8 = false;
|
||||||
|
printf("%6d -> '", tokens[i]);
|
||||||
|
write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
|
||||||
|
if (invalid_utf8) {
|
||||||
|
printf("' (utf-8 decode failure)\n");
|
||||||
|
} else {
|
||||||
|
printf("'\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (printing_ids) {
|
||||||
|
printf("]\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// silence valgrind
|
||||||
|
llama_free(ctx);
|
||||||
|
llama_free_model(model);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
12
flake.lock
generated
12
flake.lock
generated
|
@ -5,11 +5,11 @@
|
||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1714641030,
|
"lastModified": 1715865404,
|
||||||
"narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
|
"narHash": "sha256-/GJvTdTpuDjNn84j82cU6bXztE0MSkdnTWClUCRub78=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
|
"rev": "8dc45382d5206bd292f9c2768b8058a8fd8311d9",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1714635257,
|
"lastModified": 1716509168,
|
||||||
"narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
|
"narHash": "sha256-4zSIhSRRIoEBwjbPm3YiGtbd8HDWzFxJjw5DYSDy1n8=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
|
"rev": "bfb7a882678e518398ce9a31a881538679f6f092",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
|
|
@ -144,6 +144,10 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
|
#include <arm_sve.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
// 16-bit float
|
// 16-bit float
|
||||||
// on Arm, we use __fp16
|
// on Arm, we use __fp16
|
||||||
// on x86, we use uint16_t
|
// on x86, we use uint16_t
|
||||||
|
|
|
@ -3813,7 +3813,44 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
|
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
||||||
|
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
||||||
|
|
||||||
|
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||||
|
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||||
|
|
||||||
|
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i += 2) {
|
||||||
|
const block_q4_0 * restrict x0 = &x[i + 0];
|
||||||
|
const block_q4_0 * restrict x1 = &x[i + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||||
|
|
||||||
|
// load x
|
||||||
|
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||||
|
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
|
||||||
|
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
|
||||||
|
|
||||||
|
// sub 8
|
||||||
|
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
||||||
|
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||||
|
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||||
|
|
||||||
|
// dot product
|
||||||
|
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
@ -5384,7 +5421,32 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#if defined(__ARM_NEON)
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
|
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||||
|
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||||
|
|
||||||
|
assert(nb % 2 == 0); // TODO: handle odd nb
|
||||||
|
|
||||||
|
for (int i = 0; i < nb; i += 2) {
|
||||||
|
const block_q8_0 * restrict x0 = &x[i + 0];
|
||||||
|
const block_q8_0 * restrict x1 = &x[i + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[i + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[i + 1];
|
||||||
|
|
||||||
|
// load x
|
||||||
|
const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
|
||||||
|
const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||||
|
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||||
|
|
||||||
|
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
*s = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||||
|
#elif defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
|
||||||
|
|
10
ggml.c
10
ggml.c
|
@ -22742,6 +22742,16 @@ int ggml_cpu_has_neon(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int ggml_cpu_has_sve(void) {
|
||||||
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
|
// TODO: Currently, SVE 256 bit is only supported.
|
||||||
|
GGML_ASSERT(svcntb() == QK8_0);
|
||||||
|
return 1;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
int ggml_cpu_has_arm_fma(void) {
|
int ggml_cpu_has_arm_fma(void) {
|
||||||
#if defined(__ARM_FEATURE_FMA)
|
#if defined(__ARM_FEATURE_FMA)
|
||||||
return 1;
|
return 1;
|
||||||
|
|
1
ggml.h
1
ggml.h
|
@ -2404,6 +2404,7 @@ extern "C" {
|
||||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||||
GGML_API int ggml_cpu_has_fma (void);
|
GGML_API int ggml_cpu_has_fma (void);
|
||||||
GGML_API int ggml_cpu_has_neon (void);
|
GGML_API int ggml_cpu_has_neon (void);
|
||||||
|
GGML_API int ggml_cpu_has_sve (void);
|
||||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||||
GGML_API int ggml_cpu_has_metal (void);
|
GGML_API int ggml_cpu_has_metal (void);
|
||||||
GGML_API int ggml_cpu_has_f16c (void);
|
GGML_API int ggml_cpu_has_f16c (void);
|
||||||
|
|
|
@ -139,6 +139,7 @@ class MODEL_ARCH(IntEnum):
|
||||||
COMMAND_R = auto()
|
COMMAND_R = auto()
|
||||||
DBRX = auto()
|
DBRX = auto()
|
||||||
OLMO = auto()
|
OLMO = auto()
|
||||||
|
ARCTIC = auto()
|
||||||
|
|
||||||
|
|
||||||
class MODEL_TENSOR(IntEnum):
|
class MODEL_TENSOR(IntEnum):
|
||||||
|
@ -167,6 +168,7 @@ class MODEL_TENSOR(IntEnum):
|
||||||
FFN_DOWN = auto()
|
FFN_DOWN = auto()
|
||||||
FFN_UP = auto()
|
FFN_UP = auto()
|
||||||
FFN_ACT = auto()
|
FFN_ACT = auto()
|
||||||
|
FFN_NORM_EXP = auto()
|
||||||
FFN_GATE_EXP = auto()
|
FFN_GATE_EXP = auto()
|
||||||
FFN_DOWN_EXP = auto()
|
FFN_DOWN_EXP = auto()
|
||||||
FFN_UP_EXP = auto()
|
FFN_UP_EXP = auto()
|
||||||
|
@ -218,6 +220,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
||||||
MODEL_ARCH.COMMAND_R: "command-r",
|
MODEL_ARCH.COMMAND_R: "command-r",
|
||||||
MODEL_ARCH.DBRX: "dbrx",
|
MODEL_ARCH.DBRX: "dbrx",
|
||||||
MODEL_ARCH.OLMO: "olmo",
|
MODEL_ARCH.OLMO: "olmo",
|
||||||
|
MODEL_ARCH.ARCTIC: "arctic",
|
||||||
}
|
}
|
||||||
|
|
||||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
|
@ -251,6 +254,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||||
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
||||||
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
||||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||||
|
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
||||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
||||||
|
@ -732,6 +736,27 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||||
MODEL_TENSOR.FFN_DOWN,
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
MODEL_TENSOR.FFN_UP,
|
MODEL_TENSOR.FFN_UP,
|
||||||
],
|
],
|
||||||
|
MODEL_ARCH.ARCTIC: [
|
||||||
|
MODEL_TENSOR.TOKEN_EMBD,
|
||||||
|
MODEL_TENSOR.OUTPUT_NORM,
|
||||||
|
MODEL_TENSOR.OUTPUT,
|
||||||
|
MODEL_TENSOR.ROPE_FREQS,
|
||||||
|
MODEL_TENSOR.ATTN_NORM,
|
||||||
|
MODEL_TENSOR.ATTN_Q,
|
||||||
|
MODEL_TENSOR.ATTN_K,
|
||||||
|
MODEL_TENSOR.ATTN_V,
|
||||||
|
MODEL_TENSOR.ATTN_OUT,
|
||||||
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||||
|
MODEL_TENSOR.FFN_GATE_INP,
|
||||||
|
MODEL_TENSOR.FFN_NORM,
|
||||||
|
MODEL_TENSOR.FFN_GATE,
|
||||||
|
MODEL_TENSOR.FFN_DOWN,
|
||||||
|
MODEL_TENSOR.FFN_UP,
|
||||||
|
MODEL_TENSOR.FFN_NORM_EXP,
|
||||||
|
MODEL_TENSOR.FFN_GATE_EXP,
|
||||||
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||||
|
MODEL_TENSOR.FFN_UP_EXP,
|
||||||
|
],
|
||||||
# TODO
|
# TODO
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,6 +12,8 @@ from typing import Any, Literal, NamedTuple, TypeVar, Union
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import numpy.typing as npt
|
import numpy.typing as npt
|
||||||
|
|
||||||
|
from .quants import quant_shape_to_byte_shape
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -251,6 +253,7 @@ class GGUFReader:
|
||||||
tensor_names.add(tensor_name)
|
tensor_names.add(tensor_name)
|
||||||
ggml_type = GGMLQuantizationType(raw_dtype[0])
|
ggml_type = GGMLQuantizationType(raw_dtype[0])
|
||||||
n_elems = int(np.prod(dims))
|
n_elems = int(np.prod(dims))
|
||||||
|
np_dims = tuple(reversed(dims.tolist()))
|
||||||
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
||||||
n_bytes = n_elems * type_size // block_size
|
n_bytes = n_elems * type_size // block_size
|
||||||
data_offs = int(start_offs + offset_tensor[0])
|
data_offs = int(start_offs + offset_tensor[0])
|
||||||
|
@ -279,6 +282,7 @@ class GGUFReader:
|
||||||
else:
|
else:
|
||||||
item_count = n_bytes
|
item_count = n_bytes
|
||||||
item_type = np.uint8
|
item_type = np.uint8
|
||||||
|
np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
|
||||||
tensors.append(ReaderTensor(
|
tensors.append(ReaderTensor(
|
||||||
name = tensor_name,
|
name = tensor_name,
|
||||||
tensor_type = ggml_type,
|
tensor_type = ggml_type,
|
||||||
|
@ -286,7 +290,7 @@ class GGUFReader:
|
||||||
n_elements = n_elems,
|
n_elements = n_elems,
|
||||||
n_bytes = n_bytes,
|
n_bytes = n_bytes,
|
||||||
data_offset = data_offs,
|
data_offset = data_offs,
|
||||||
data = self._get(data_offs, item_type, item_count),
|
data = self._get(data_offs, item_type, item_count).reshape(np_dims),
|
||||||
field = field,
|
field = field,
|
||||||
))
|
))
|
||||||
self.tensors = tensors
|
self.tensors = tensors
|
||||||
|
|
|
@ -13,7 +13,6 @@ from string import ascii_letters, digits
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from .constants import (
|
from .constants import (
|
||||||
GGML_QUANT_SIZES,
|
|
||||||
GGUF_DEFAULT_ALIGNMENT,
|
GGUF_DEFAULT_ALIGNMENT,
|
||||||
GGUF_MAGIC,
|
GGUF_MAGIC,
|
||||||
GGUF_VERSION,
|
GGUF_VERSION,
|
||||||
|
@ -26,6 +25,8 @@ from .constants import (
|
||||||
TokenType,
|
TokenType,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from .quants import quant_shape_from_byte_shape
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@ -229,10 +230,7 @@ class GGUFWriter:
|
||||||
else:
|
else:
|
||||||
dtype = raw_dtype
|
dtype = raw_dtype
|
||||||
if tensor_dtype == np.uint8:
|
if tensor_dtype == np.uint8:
|
||||||
block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
|
tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
|
||||||
if tensor_shape[-1] % type_size != 0:
|
|
||||||
raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
|
|
||||||
tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
|
|
||||||
n_dims = len(tensor_shape)
|
n_dims = len(tensor_shape)
|
||||||
self.ti_data += self._pack("I", n_dims)
|
self.ti_data += self._pack("I", n_dims)
|
||||||
for i in range(n_dims):
|
for i in range(n_dims):
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
from typing import Callable
|
from typing import Callable, Sequence
|
||||||
|
|
||||||
from numpy.typing import DTypeLike
|
from numpy.typing import DTypeLike
|
||||||
|
|
||||||
|
@ -9,6 +9,20 @@ from .lazy import LazyNumpyTensor
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
||||||
|
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||||
|
if shape[-1] % block_size != 0:
|
||||||
|
raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
|
||||||
|
return (*shape[:-1], shape[-1] // block_size * type_size)
|
||||||
|
|
||||||
|
|
||||||
|
def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
|
||||||
|
block_size, type_size = GGML_QUANT_SIZES[quant_type]
|
||||||
|
if shape[-1] % type_size != 0:
|
||||||
|
raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
|
||||||
|
return (*shape[:-1], shape[-1] // type_size * block_size)
|
||||||
|
|
||||||
|
|
||||||
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
|
||||||
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
|
||||||
n = n.astype(np.float32, copy=False).view(np.int32)
|
n = n.astype(np.float32, copy=False).view(np.int32)
|
||||||
|
|
|
@ -244,6 +244,7 @@ class TensorNameMap:
|
||||||
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
"encoder.layers.{bid}.mlp.fc11", # nomic-bert
|
||||||
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
"model.layers.{bid}.mlp.c_fc", # starcoder2
|
||||||
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
"encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2
|
||||||
|
"model.layers.{bid}.residual_mlp.w3", # arctic
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_UP_EXP: (
|
MODEL_TENSOR.FFN_UP_EXP: (
|
||||||
|
@ -272,6 +273,7 @@ class TensorNameMap:
|
||||||
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
"encoder.layers.{bid}.mlp.fc12", # nomic-bert
|
||||||
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
|
"encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2
|
||||||
"transformer.h.{bid}.mlp.linear_1", # refact
|
"transformer.h.{bid}.mlp.linear_1", # refact
|
||||||
|
"model.layers.{bid}.residual_mlp.w1", # arctic
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_GATE_EXP: (
|
MODEL_TENSOR.FFN_GATE_EXP: (
|
||||||
|
@ -306,6 +308,7 @@ class TensorNameMap:
|
||||||
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
"encoder.layers.{bid}.mlp.fc2", # nomic-bert
|
||||||
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
"model.layers.{bid}.mlp.c_proj", # starcoder2
|
||||||
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
"encoder.layer.{bid}.mlp.wo", # jina-bert-v2
|
||||||
|
"model.layers.{bid}.residual_mlp.w2", # arctic
|
||||||
),
|
),
|
||||||
|
|
||||||
MODEL_TENSOR.FFN_DOWN_EXP: (
|
MODEL_TENSOR.FFN_DOWN_EXP: (
|
||||||
|
@ -382,6 +385,18 @@ class TensorNameMap:
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# architecture-specific block mappings
|
||||||
|
arch_block_mappings_cfg: dict[MODEL_ARCH, dict[MODEL_TENSOR, tuple[str, ...]]] = {
|
||||||
|
MODEL_ARCH.ARCTIC: {
|
||||||
|
MODEL_TENSOR.FFN_NORM: (
|
||||||
|
"model.layers.{bid}.residual_layernorm",
|
||||||
|
),
|
||||||
|
MODEL_TENSOR.FFN_NORM_EXP: (
|
||||||
|
"model.layers.{bid}.post_attention_layernorm",
|
||||||
|
),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
mapping: dict[str, tuple[MODEL_TENSOR, str]]
|
||||||
|
|
||||||
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
|
def __init__(self, arch: MODEL_ARCH, n_blocks: int):
|
||||||
|
@ -393,12 +408,14 @@ class TensorNameMap:
|
||||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||||
for key in keys:
|
for key in keys:
|
||||||
self.mapping[key] = (tensor, tensor_name)
|
self.mapping[key] = (tensor, tensor_name)
|
||||||
|
if arch in self.arch_block_mappings_cfg:
|
||||||
|
self.block_mappings_cfg.update(self.arch_block_mappings_cfg[arch])
|
||||||
for bid in range(n_blocks):
|
for bid in range(n_blocks):
|
||||||
for tensor, keys in self.block_mappings_cfg.items():
|
for tensor, keys in self.block_mappings_cfg.items():
|
||||||
if tensor not in MODEL_TENSORS[arch]:
|
if tensor not in MODEL_TENSORS[arch]:
|
||||||
continue
|
continue
|
||||||
# TODO: make this configurable
|
# TODO: make this configurable
|
||||||
n_experts = 60
|
n_experts = 128
|
||||||
for xid in range(n_experts):
|
for xid in range(n_experts):
|
||||||
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid)
|
||||||
self.mapping[tensor_name] = (tensor, tensor_name)
|
self.mapping[tensor_name] = (tensor, tensor_name)
|
||||||
|
|
|
@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
|
||||||
|
|
||||||
for tensor in reader.tensors:
|
for tensor in reader.tensors:
|
||||||
total_bytes += tensor.n_bytes
|
total_bytes += tensor.n_bytes
|
||||||
# Dimensions are written in reverse order, so flip them first
|
writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
||||||
shape = np.flipud(tensor.shape).tolist()
|
|
||||||
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
|
||||||
|
|
||||||
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
|
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
|
||||||
|
|
||||||
|
|
313
llama.cpp
313
llama.cpp
|
@ -103,7 +103,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define LLAMA_MAX_NODES 8192
|
#define LLAMA_MAX_NODES 8192
|
||||||
#define LLAMA_MAX_EXPERTS 60
|
#define LLAMA_MAX_EXPERTS 128
|
||||||
|
|
||||||
//
|
//
|
||||||
// logging
|
// logging
|
||||||
|
@ -221,6 +221,7 @@ enum llm_arch {
|
||||||
LLM_ARCH_COMMAND_R,
|
LLM_ARCH_COMMAND_R,
|
||||||
LLM_ARCH_DBRX,
|
LLM_ARCH_DBRX,
|
||||||
LLM_ARCH_OLMO,
|
LLM_ARCH_OLMO,
|
||||||
|
LLM_ARCH_ARCTIC,
|
||||||
LLM_ARCH_UNKNOWN,
|
LLM_ARCH_UNKNOWN,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -257,6 +258,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||||
{ LLM_ARCH_COMMAND_R, "command-r" },
|
{ LLM_ARCH_COMMAND_R, "command-r" },
|
||||||
{ LLM_ARCH_DBRX, "dbrx" },
|
{ LLM_ARCH_DBRX, "dbrx" },
|
||||||
{ LLM_ARCH_OLMO, "olmo" },
|
{ LLM_ARCH_OLMO, "olmo" },
|
||||||
|
{ LLM_ARCH_ARCTIC, "arctic" },
|
||||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -459,6 +461,7 @@ enum llm_tensor {
|
||||||
LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
|
LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility
|
||||||
LLM_TENSOR_FFN_GATE_EXP,
|
LLM_TENSOR_FFN_GATE_EXP,
|
||||||
LLM_TENSOR_FFN_UP_EXP,
|
LLM_TENSOR_FFN_UP_EXP,
|
||||||
|
LLM_TENSOR_FFN_NORM_EXPS,
|
||||||
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
LLM_TENSOR_FFN_DOWN_EXPS, // merged experts
|
||||||
LLM_TENSOR_FFN_GATE_EXPS,
|
LLM_TENSOR_FFN_GATE_EXPS,
|
||||||
LLM_TENSOR_FFN_UP_EXPS,
|
LLM_TENSOR_FFN_UP_EXPS,
|
||||||
|
@@ -1036,6 +1039,28 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_ARCTIC,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
+            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1736,6 +1761,7 @@ enum e_model {
     MODEL_8x7B,
     MODEL_8x22B,
     MODEL_16x12B,
+    MODEL_10B_128x3_66B,
 };

 static const size_t kiB = 1024;
@@ -1911,6 +1937,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_norm_b;
     struct ggml_tensor * layer_out_norm;
     struct ggml_tensor * layer_out_norm_b;
+    struct ggml_tensor * ffn_norm_exps;

     // ff
     struct ggml_tensor * ffn_gate; // w1
@@ -3785,47 +3812,48 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {

 static const char * llama_model_type_name(e_model type) {
     switch (type) {
         case MODEL_14M: return "14M";
         case MODEL_17M: return "17M";
         case MODEL_22M: return "22M";
         case MODEL_33M: return "33M";
         case MODEL_70M: return "70M";
         case MODEL_109M: return "109M";
         case MODEL_137M: return "137M";
         case MODEL_160M: return "160M";
         case MODEL_335M: return "335M";
         case MODEL_410M: return "410M";
         case MODEL_0_5B: return "0.5B";
         case MODEL_1B: return "1B";
         case MODEL_1_4B: return "1.4B";
         case MODEL_2B: return "2B";
         case MODEL_2_8B: return "2.8B";
         case MODEL_3B: return "3B";
         case MODEL_4B: return "4B";
         case MODEL_6_9B: return "6.9B";
         case MODEL_7B: return "7B";
         case MODEL_8B: return "8B";
         case MODEL_12B: return "12B";
         case MODEL_13B: return "13B";
         case MODEL_14B: return "14B";
         case MODEL_15B: return "15B";
         case MODEL_20B: return "20B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
         case MODEL_35B: return "35B";
         case MODEL_40B: return "40B";
         case MODEL_65B: return "65B";
         case MODEL_70B: return "70B";
         case MODEL_314B: return "314B";
         case MODEL_SMALL: return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
         case MODEL_LARGE: return "0.8B";
         case MODEL_XL: return "1.5B";
         case MODEL_A2_7B: return "A2.7B";
         case MODEL_8x7B: return "8x7B";
         case MODEL_8x22B: return "8x22B";
         case MODEL_16x12B: return "16x12B";
+        case MODEL_10B_128x3_66B: return "10B+128x3.66B";
         default: return "?B";
     }
 }

@@ -4347,6 +4375,19 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                if (hparams.n_expert == 128) {
+                    switch (hparams.n_layer) {
+                        case 35: model.type = e_model::MODEL_10B_128x3_66B; break;
+                        default: model.type = e_model::MODEL_UNKNOWN;
+                    }
+                } else {
+                    model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -4556,6 +4597,9 @@ static void llm_load_vocab(
         } else if (
             tokenizer_pre == "dbrx") {
             vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
+        } else if (
+            tokenizer_pre == "smaug-bpe") {
+            vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_SMAUG;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
@@ -6133,6 +6177,46 @@ static bool llm_load_tensors(
                         layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
                     }
                 } break;
+            case LLM_ARCH_ARCTIC:
+                {
+                    model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                    // output
+                    {
+                        model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                        // if output is NULL, init from the input tok embed
+                        if (model.output == NULL) {
+                            model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED);
+                        }
+                    }
+
+                    for (int i = 0; i < n_layer; ++i) {
+                        ggml_context * ctx_layer = ctx_for_layer(i);
+                        ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                        auto & layer = model.layers[i];
+
+                        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                        layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                        layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                        layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+
+                        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_embd});
+                        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_embd, n_embd});
+                        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_embd});
+
+                        layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert});
+                        layer.ffn_norm_exps = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM_EXPS, "weight", i), {n_embd});
+                        layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false);
+                        layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert});
+                        layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert});
+                    }
+                } break;
             default:
                 throw std::runtime_error("unknown architecture");
         }
@@ -10794,6 +10878,140 @@ struct llm_build_context {

         return gf;
     }
+
+    struct ggml_cgraph * build_arctic() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        // mutable variable, needed during the last layer of the computation to skip unused tokens
+        int32_t n_tokens = this->n_tokens;
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        GGML_ASSERT(n_embd_head == hparams.n_rot);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_ext(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, nullptr,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
+                        model.layers[il].wo, NULL,
+                        Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            if (il == n_layer - 1) {
+                // skip computing output for unused tokens
+                struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+                n_tokens = n_outputs;
+                cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                    model.layers[il].ffn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm", il);
+
+            cur = llm_build_ffn(ctx0, cur,
+                    model.layers[il].ffn_up, NULL,
+                    model.layers[il].ffn_gate, NULL,
+                    model.layers[il].ffn_down, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+            cb(cur, "ffn_out", il);
+
+            struct ggml_tensor * ffn_out = ggml_add(ctx0, cur, ffn_inp);
+            cb(ffn_out, "ffn_out", il);
+
+            // MoE
+            cur = llm_build_norm(ctx0, inpSA, hparams,
+                    model.layers[il].ffn_norm_exps, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "ffn_norm_exps", il);
+
+            cur = llm_build_moe_ffn(ctx0, cur,
+                    model.layers[il].ffn_gate_inp,
+                    model.layers[il].ffn_up_exps,
+                    model.layers[il].ffn_gate_exps,
+                    model.layers[il].ffn_down_exps,
+                    n_expert, n_expert_used,
+                    LLM_FFN_SILU, true,
+                    cb, il);
+            cb(cur, "ffn_moe_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_out);
+            cb(cur, "ffn_out", il);
+
+            ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+            if (layer_dir != nullptr) {
+                cur = ggml_add(ctx0, cur, layer_dir);
+            }
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };

 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
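For orientation while reading the build_arctic() hunk above: unlike a plain Llama block, each Arctic layer keeps a dense FFN on the post-attention residual and adds a parallel MoE branch whose normalization is applied to the layer input (inpSA) rather than to the dense FFN output, and the two branches are summed at the end. The sketch below restates only that residual wiring with scalar stand-ins so it compiles on its own; the double type and the *_stub helpers are hypothetical placeholders, not llama.cpp/ggml calls.

#include <cstdio>

// Scalar stand-ins used only to illustrate the residual wiring built by
// build_arctic() above. All *_stub helpers are hypothetical placeholders.
static double rms_norm_stub (double x) { return x; }       // pretend normalization
static double attention_stub(double x) { return 0.5 * x; } // pretend self-attention
static double dense_ffn_stub(double x) { return 0.1 * x; } // pretend gate/up/down FFN
static double moe_ffn_stub  (double x) { return 0.2 * x; } // pretend top-k expert FFN

static double arctic_layer(double inpL) {
    double inpSA   = inpL;                                   // saved layer input
    double attn    = attention_stub(rms_norm_stub(inpL));    // attn_norm -> attention
    double ffn_inp = attn + inpSA;                           // first residual
    double dense   = dense_ffn_stub(rms_norm_stub(ffn_inp)); // ffn_norm -> dense FFN
    double ffn_out = dense + ffn_inp;                        // second residual
    double moe     = moe_ffn_stub(rms_norm_stub(inpSA));     // ffn_norm_exps -> MoE, fed from inpSA
    return moe + ffn_out;                                    // branches are summed
}

int main() {
    printf("one-layer toy output: %f\n", arctic_layer(1.0));
    return 0;
}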
@@ -11008,6 +11226,10 @@ static struct ggml_cgraph * llama_build_graph(
             {
                 result = llm.build_gptneox();
             } break;
+        case LLM_ARCH_ARCTIC:
+            {
+                result = llm.build_arctic();
+            } break;
         default:
             GGML_ASSERT(false);
     }
@@ -12297,6 +12519,7 @@ struct llm_tokenizer_bpe {
                 });
                 break;
             case LLAMA_VOCAB_PRE_TYPE_DBRX:
+            case LLAMA_VOCAB_PRE_TYPE_SMAUG:
                 word_collection = unicode_regex_split(text, {
                     // same as llama3
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -16019,6 +16242,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
         case LLM_ARCH_OLMO:
+        case LLM_ARCH_ARCTIC:
            return LLAMA_ROPE_TYPE_NORM;

        // the pairs of head values are offset by n_rot/2
@@ -17645,6 +17869,10 @@ bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
     );
 }

+bool llama_token_is_control(const struct llama_model * model, llama_token token) {
+    return llama_is_control_token(model->vocab, token);
+}
+
 llama_token llama_token_bos(const struct llama_model * model) {
     return model->vocab.special_bos_id;
 }
@@ -18121,6 +18349,7 @@ const char * llama_print_system_info(void) {
     s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | ";
     s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
     s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+    s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | ";
     s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
     s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
     s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
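To see the new SVE entry, a host program can simply print the capability string; a minimal sketch, assuming llama.h is on the include path and the library is linked (the exact contents of the printed line depend on the build and hardware):

#include <cstdio>
#include "llama.h"

int main() {
    // Prints the " | "-separated capability list; with the change above it
    // now contains an "SVE = 0/1" entry reflecting ggml_cpu_has_sve().
    printf("%s\n", llama_print_system_info());
    return 0;
}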
4
llama.h
@@ -85,6 +85,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
         LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
         LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+        LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
     };

     // note: these values should be synchronized with ggml_rope
@@ -823,6 +824,9 @@ extern "C" {
     // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
     LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);

+    // Identify if Token Id is a control token or a render-able token
+    LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
+
     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
     LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
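A short sketch of how client code might combine the two predicates declared above when deciding whether a sampled token should be displayed; the should_render helper and the surrounding assumptions (a loaded model pointer, a sampled token id) are illustrative only, not part of the API:

#include "llama.h"

// Hypothetical display filter built on llama_token_is_eog() and the new
// llama_token_is_control(); both take the model and a token id.
static bool should_render(const struct llama_model * model, llama_token tok) {
    if (llama_token_is_eog(model, tok)) {
        return false; // end-of-generation token: stop instead of printing
    }
    if (llama_token_is_control(model, tok)) {
        return false; // control tokens are not meant to be rendered as text
    }
    return true;      // everything else is a displayable piece
}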