Merge branch 'master' into concedo_experimental

# Conflicts:
#	.gitignore
#	README.md
#	tests/CMakeLists.txt
This commit is contained in:
Concedo 2023-08-24 15:21:24 +08:00
commit b8372d4466
51 changed files with 2120 additions and 825 deletions

View file

@ -0,0 +1,58 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-clblast
Version: master
Release: 1%{?dist}
Summary: OpenCL Inference of LLaMA model in pure C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
# clblast-devel is needed because the LLAMA_CLBLAST=1 build compiles and links against CLBlast.
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
OpenCL (CLBlast) accelerated inference for Meta's Llama 2 models using default options.
%prep
%setup -n llama.cpp-master
%build
# Bounded parallelism: a bare -j lets make spawn unlimited jobs and can exhaust memory.
make %{?_smp_mflags} LLAMA_CLBLAST=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppclblast
cp -p server %{buildroot}%{_bindir}/llamacppclblastserver
cp -p simple %{buildroot}%{_bindir}/llamacppclblastsimple
%clean
rm -rf %{buildroot}
# Remove only this package's unpacked tree; wiping the whole build directory
# would also delete other packages' work in a shared rpmbuild tree.
rm -rf %{_builddir}/llama.cpp-master
%files
%{_bindir}/llamacppclblast
%{_bindir}/llamacppclblastserver
%{_bindir}/llamacppclblastsimple
%pre
%post
%preun
%postun
%changelog

View file

@ -0,0 +1,59 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp-cublas
Version: master
Release: 1%{?dist}
Summary: CUDA (cuBLAS) Inference of LLaMA model in pure C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git cuda-toolkit
Requires: cuda-toolkit
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CUDA (cuBLAS) accelerated inference for Meta's Llama 2 models using default options.
%prep
%setup -n llama.cpp-master
%build
# Bounded parallelism: a bare -j lets make spawn unlimited jobs and can exhaust memory.
make %{?_smp_mflags} LLAMA_CUBLAS=1
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacppcublas
cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
%clean
rm -rf %{buildroot}
# Remove only this package's unpacked tree; wiping the whole build directory
# would also delete other packages' work in a shared rpmbuild tree.
rm -rf %{_builddir}/llama.cpp-master
%files
%{_bindir}/llamacppcublas
%{_bindir}/llamacppcublasserver
%{_bindir}/llamacppcublassimple
%pre
%post
%preun
%postun
%changelog

View file

@ -0,0 +1,58 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
# We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different dependencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
# It is up to the user to install the correct vendor-specific support.
Name: llama.cpp
Version: master
Release: 1%{?dist}
Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git
URL: https://github.com/ggerganov/llama.cpp
%define debug_package %{nil}
%define source_date_epoch_from_changelog 0
%description
CPU inference for Meta's Llama 2 models using default options.
%prep
%autosetup
%build
# Bounded parallelism: a bare -j lets make spawn unlimited jobs and can exhaust memory.
make %{?_smp_mflags}
%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamacpp
cp -p server %{buildroot}%{_bindir}/llamacppserver
cp -p simple %{buildroot}%{_bindir}/llamacppsimple
%clean
rm -rf %{buildroot}
# Remove only this package's unpacked tree (name-version, i.e. llama.cpp-master);
# wiping the whole build directory would also delete other packages' work.
rm -rf %{_builddir}/llama.cpp-master
%files
%{_bindir}/llamacpp
%{_bindir}/llamacppserver
%{_bindir}/llamacppsimple
%pre
%post
%preun
%postun
%changelog

0
ci/run.sh Normal file → Executable file
View file

View file

@ -417,6 +417,18 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.antiprompt.push_back(argv[i]); params.antiprompt.push_back(argv[i]);
} else if (arg == "--perplexity") { } else if (arg == "--perplexity") {
params.perplexity = true; params.perplexity = true;
} else if (arg == "--ppl-stride") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.ppl_stride = std::stoi(argv[i]);
} else if (arg == "--ppl-output-type") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.ppl_output_type = std::stoi(argv[i]);
} else if (arg == "--hellaswag") { } else if (arg == "--hellaswag") {
params.hellaswag = true; params.hellaswag = true;
} else if (arg == "--hellaswag-tasks") { } else if (arg == "--hellaswag-tasks") {
@ -732,35 +744,3 @@ std::string llama_token_to_str(const struct llama_context * ctx, llama_token tok
return std::string(result.data(), result.size()); return std::string(result.data(), result.size());
} }
std::vector<llama_token> llama_tokenize_bpe(
struct llama_context * ctx,
const std::string & text,
bool add_bos) {
int n_tokens = text.length() + add_bos;
std::vector<llama_token> result(n_tokens);
n_tokens = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_tokenize_bpe(ctx, text.c_str(), result.data(), result.size(), add_bos);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
}
return result;
}
// Convenience wrapper around the buffer-based llama_token_to_str_bpe overload:
// renders `token` into a std::string.
//
// Starts with an 8-byte zero-filled buffer. If the underlying call returns a
// negative value, the buffer is resized to the negated value and the call is
// repeated; the second result is asserted to match that size.
std::string llama_token_to_str_bpe(const struct llama_context * ctx, llama_token token) {
    std::vector<char> piece(8, 0);

    const int written = llama_token_to_str_bpe(ctx, token, piece.data(), piece.size());
    if (written >= 0) {
        // Fits: keep only the bytes that were written.
        piece.resize(written);
    } else {
        // Negative result: retry with a buffer of size -written.
        piece.resize(-written);
        const int check = llama_token_to_str_bpe(ctx, token, piece.data(), piece.size());
        GGML_ASSERT(check == -written);
    }

    return std::string(piece.begin(), piece.end());
}

View file

@ -64,6 +64,10 @@ struct gpt_params {
std::string lora_adapter = ""; // lora adapter path std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter std::string lora_base = ""; // base model path for the lora adapter
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
@ -116,15 +120,6 @@ std::vector<llama_token> llama_tokenize(
const std::string & text, const std::string & text,
bool add_bos); bool add_bos);
std::vector<llama_token> llama_tokenize_bpe(
struct llama_context * ctx,
const std::string & text,
bool add_bos);
std::string llama_token_to_str( std::string llama_token_to_str(
const struct llama_context * ctx, const struct llama_context * ctx,
llama_token token); llama_token token);
std::string llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token);

54
convert-falcon-hf-to-gguf.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# HF falcon--> gguf conversion # HF falcon--> gguf conversion
import gguf import gguf
@ -94,14 +95,17 @@ print("gguf: get model metadata")
block_count = hparams["n_layer"] block_count = hparams["n_layer"]
gguf_writer.add_name(last_dir) gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json gguf_writer.add_context_length(2048) # not in config.json
gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"]) gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"]) gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count) gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["n_head"]) gguf_writer.add_head_count(hparams["n_head"])
if "n_head_kv" in hparams: gguf_writer.add_head_count_kv(hparams["n_head_kv"]) if "n_head_kv" in hparams:
gguf_writer.add_head_count_kv(hparams["n_head_kv"])
else:
gguf_writer.add_head_count_kv(1)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
# TOKENIZATION # TOKENIZATION
@ -109,6 +113,8 @@ gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
print("gguf: get tokenizer metadata") print("gguf: get tokenizer metadata")
tokens: List[str] = [] tokens: List[str] = []
scores: List[float] = []
toktypes: List[int] = []
merges: List[str] = [] merges: List[str] = []
@ -152,41 +158,30 @@ if Path(dir_model + "/tokenizer.json").is_file():
text = bytearray(pad_token) text = bytearray(pad_token)
tokens.append(text) tokens.append(text)
scores.append(0.0) # dymmy
toktypes.append(gguf.TokenType.NORMAL) # dummy
gguf_writer.add_token_list(tokens) gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
if "added_tokens" in tokenizer_json and Path(dir_model + "/tokenizer_config.json").is_file(): print("gguf: get special token ids")
print("gguf: get special token ids") # Look for special tokens in config.json
with open(dir_model + "/tokenizer_config.json", "r", encoding="utf-8") as f: if "bos_token_id" in hparams and hparams["bos_token_id"] != None:
tokenizer_config = json.load(f) gguf_writer.add_bos_token_id(hparams["bos_token_id"])
# find special token ids if "eos_token_id" in hparams and hparams["eos_token_id"] != None:
gguf_writer.add_eos_token_id(hparams["eos_token_id"])
if "bos_token" in tokenizer_config: if "unk_token_id" in hparams and hparams["unk_token_id"] != None:
for key in tokenizer_json["added_tokens"]: gguf_writer.add_unk_token_id(hparams["unk_token_id"])
if key["content"] == tokenizer_config["bos_token"]:
gguf_writer.add_bos_token_id(key["id"])
if "eos_token" in tokenizer_config: if "sep_token_id" in hparams and hparams["sep_token_id"] != None:
for key in tokenizer_json["added_tokens"]: gguf_writer.add_sep_token_id(hparams["sep_token_id"])
if key["content"] == tokenizer_config["eos_token"]:
gguf_writer.add_eos_token_id(key["id"])
if "unk_token" in tokenizer_config: if "pad_token_id" in hparams and hparams["pad_token_id"] != None:
for key in tokenizer_json["added_tokens"]: gguf_writer.add_pad_token_id(hparams["pad_token_id"])
if key["content"] == tokenizer_config["unk_token"]:
gguf_writer.add_unk_token_id(key["id"])
if "sep_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["sep_token"]:
gguf_writer.add_sep_token_id(key["id"])
if "pad_token" in tokenizer_config:
for key in tokenizer_json["added_tokens"]:
if key["content"] == tokenizer_config["pad_token"]:
gguf_writer.add_pad_token_id(key["id"])
# TENSORS # TENSORS
@ -196,6 +191,7 @@ tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform # params for qkv transform
n_head = hparams["n_head"] n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1 n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["hidden_size"] // n_head head_dim = hparams["hidden_size"] // n_head
# tensor info # tensor info

1
convert-gptneox-hf-to-gguf.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# HF gptneox--> gguf conversion # HF gptneox--> gguf conversion
import gguf import gguf

1
convert-llama-7b-pth-to-gguf.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# 7b pth llama --> gguf conversion # 7b pth llama --> gguf conversion
# Only models with a single datafile are supported, like 7B # Only models with a single datafile are supported, like 7B
# HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model # HF files required in the model dir: config.json tokenizer_config.json tokenizer.json tokenizer.model

11
convert-llama-ggmlv3-to-gguf.py Normal file → Executable file
View file

@ -1,12 +1,11 @@
import sys, struct, math, argparse, warnings #!/usr/bin/env python3
import sys, struct, math, argparse
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import gguf import gguf
warnings.filterwarnings('error')
# Note: Does not support GGML_QKK_64 # Note: Does not support GGML_QKK_64
QK_K = 256 QK_K = 256
# Items here are (block size, type size) # Items here are (block size, type size)
@ -95,7 +94,7 @@ class Tensor:
pad = ((offset + 31) & ~31) - offset pad = ((offset + 31) & ~31) - offset
offset += pad offset += pad
n_elems = np.prod(self.dims) n_elems = np.prod(self.dims)
n_bytes = (n_elems * tysize) // blksize n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
self.start_offset = offset self.start_offset = offset
self.len_bytes = n_bytes self.len_bytes = n_bytes
offset += n_bytes offset += n_bytes
@ -327,11 +326,7 @@ def main():
data = np.memmap(cfg.input, mode = 'r') data = np.memmap(cfg.input, mode = 'r')
model = GGMLV3Model() model = GGMLV3Model()
print('* Scanning GGML input file') print('* Scanning GGML input file')
try:
offset = model.load(data, 0) offset = model.load(data, 0)
except OverflowError:
print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr)
raise
print(f'* GGML model hyperparameters: {model.hyperparameters}') print(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None vocab_override = None
params_override = None params_override = None

1
convert-llama-hf-to-gguf.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
# HF llama --> gguf conversion # HF llama --> gguf conversion
import gguf import gguf

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
import json import json
import os import os
import re import re
@ -6,23 +6,22 @@ import struct
import sys import sys
from typing import Any, Dict, Sequence, TextIO from typing import Any, Dict, Sequence, TextIO
import numpy as np
import torch import torch
from convert import DATA_TYPE_TO_FTYPE, NUMPY_TYPE_TO_DATA_TYPE, DataType NUMPY_TYPE_TO_FTYPE: Dict[str, int] = {"float32": 0, "float16": 1}
HF_SUBLAYER_TO_GGML = { HF_SUBLAYER_TO_GGML = {
"self_attn.q_proj": "attention.wq", "self_attn.q_proj": "attn_q",
"self_attn.k_proj": "attention.wk", "self_attn.k_proj": "attn_k",
"self_attn.v_proj": "attention.wv", "self_attn.v_proj": "attn_v",
"self_attn.o_proj": "attention.wo", "self_attn.o_proj": "attn_output",
"mlp.gate_proj": "feed_forward.w1", "mlp.gate_proj": "ffn_gate",
"mlp.down_proj": "feed_forward.w2", "mlp.down_proj": "ffn_down",
"mlp.up_proj": "feed_forward.w3", "mlp.up_proj": "ffn_up",
"input_layernorm": "attention_norm", "input_layernorm": "attn_norm",
"post_attention_layernorm": "ffn_norm", "post_attention_layernorm": "ffn_norm",
# "norm": "norm",
# "embed_tokens": "tok_embeddings",
# "lm_head": "output",
} }
@ -39,7 +38,7 @@ def translate_tensor_name(t: str) -> str:
sys.exit(1) sys.exit(1)
output_string = ( output_string = (
f"layers.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}" f"blk.{nn}.{HF_SUBLAYER_TO_GGML[sub_layer]}.weight.lora{lora_type}"
) )
return output_string return output_string
else: else:
@ -54,12 +53,14 @@ def write_file_header(fout: TextIO, params: Dict[str, Any]) -> None:
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int # https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead # but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible # let's convert to int, but fail if lossless conversion is not possible
assert int(params["lora_alpha"]) == params["lora_alpha"], "cannot convert float to int losslessly" assert (
int(params["lora_alpha"]) == params["lora_alpha"]
), "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"]))) fout.write(struct.pack("i", int(params["lora_alpha"])))
def write_tensor_header( def write_tensor_header(
self, name: str, shape: Sequence[int], data_type: DataType self, name: str, shape: Sequence[int], data_type: np.dtype
) -> None: ) -> None:
sname = name.encode("utf-8") sname = name.encode("utf-8")
fout.write( fout.write(
@ -67,7 +68,7 @@ def write_tensor_header(
"iii", "iii",
len(shape), len(shape),
len(sname), len(sname),
DATA_TYPE_TO_FTYPE[NUMPY_TYPE_TO_DATA_TYPE[data_type]], NUMPY_TYPE_TO_FTYPE[data_type.name],
) )
) )
fout.write(struct.pack("i" * len(shape), *shape[::-1])) fout.write(struct.pack("i" * len(shape), *shape[::-1]))

8
convert.py Normal file → Executable file
View file

@ -1,4 +1,4 @@
#!/usr/bin/env python #!/usr/bin/env python3
import gguf import gguf
import argparse import argparse
@ -733,7 +733,11 @@ class OutputFile:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
def add_meta_arch(self, params: Params) -> None: def add_meta_arch(self, params: Params) -> None:
self.gguf.add_name ("LLaMA") ver = None
if (params.n_ctx == 4096):
ver = "v2"
self.gguf.add_name ("LLaMA" if ver == None else "LLaMA " + ver)
self.gguf.add_context_length (params.n_ctx) self.gguf.add_context_length (params.n_ctx)
self.gguf.add_embedding_length (params.n_embd) self.gguf.add_embedding_length (params.n_embd)
self.gguf.add_block_count (params.n_layer) self.gguf.add_block_count (params.n_layer)

View file

@ -12,15 +12,19 @@ usage: ./convert-llama2c-to-ggml [options]
options: options:
-h, --help show this help message and exit -h, --help show this help message and exit
--copy-vocab-from-model FNAME model path from which to copy vocab (default 'models/ggml-vocab.bin') --copy-vocab-from-model FNAME model path from which to copy vocab (default 'tokenizer.bin')
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
--llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin') --llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin')
``` ```
An example command is as follows: An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>` `$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
Now you can use the model with command like: For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5` `$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
Now you can use the model with a command like:
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View file

@ -17,6 +17,9 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc. //////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct { typedef struct {
int dim; // transformer dimension int dim; // transformer dimension
@ -49,10 +52,10 @@ typedef struct {
// float* freq_cis_real; // (seq_len, dim/2) // float* freq_cis_real; // (seq_len, dim/2)
// float* freq_cis_imag; // (seq_len, dim/2) // float* freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer // (optional) classifier weights for the logits, on the last layer
//float* wcls; float* wcls;
} TransformerWeights; } TransformerWeights;
void malloc_weights(TransformerWeights* w, Config* p) { void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy // we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim](); w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@ -86,9 +89,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
w->rms_final_weight = new float[p->dim](); w->rms_final_weight = new float[p->dim]();
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = NULL;
} else {
w->wcls = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
} }
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) { int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1; if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1; if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1; if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@ -100,6 +110,22 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1; if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1; if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1; if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
// Skip freq_cis_real & freq_cis_imag
int head_size = p->dim / p->n_heads;
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
// Check we didn't forget to read anything
auto curr = ftell(f);
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
return 1;
}
return 0; return 0;
} }
@ -115,6 +141,7 @@ void free_weights(TransformerWeights* w) {
delete w->w2; delete w->w2;
delete w->w3; delete w->w3;
delete w->rms_final_weight; delete w->rms_final_weight;
if (w->wcls) delete w->wcls;
} }
void print_sample_weights(TransformerWeights *w){ void print_sample_weights(TransformerWeights *w){
@ -131,6 +158,7 @@ void print_sample_weights(TransformerWeights *w){
printf("%f\n", w->w2[0]); printf("%f\n", w->w2[0]);
printf("%f\n", w->w3[0]); printf("%f\n", w->w3[0]);
printf("%f\n", w->rms_att_weight[0]); printf("%f\n", w->rms_att_weight[0]);
if (w->wcls) printf("%f\n", w->wcls[0]);
} }
//////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -509,26 +537,28 @@ bool is_ggml_file(const char *filename) {
} }
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
// heuristic to infer whether vocab is from ggml or from llama2.c vocabulary #pragma message("TODO: implement reading vocabulary using gguf")
if (is_ggml_file(filename)) { // // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
// if (is_ggml_file(filename)) {
struct llama_context_params llama_params = llama_context_default_params(); //
llama_params.vocab_only = true; // struct llama_context_params llama_params = llama_context_default_params();
// llama_params.vocab_only = true;
struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params); //
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); // struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
// struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
const int n_vocab = llama_n_vocab(lctx); //
vocab->id_to_token.resize(n_vocab); // const int n_vocab = llama_n_vocab(lctx);
for (int i=0; i<n_vocab; ++i) { // vocab->id_to_token.resize(n_vocab);
vocab->id_to_token[i].text = llama_token_get_text(lctx, i); // for (int i=0; i<n_vocab; ++i) {
vocab->id_to_token[i].score = llama_token_get_score(lctx, i); // vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
vocab->id_to_token[i].type = llama_token_get_type(lctx, i); // vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
vocab->token_to_id.emplace(vocab->id_to_token[i].text, i); // vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
} // vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
llama_free(lctx); // }
llama_free_model(lmodel); // llama_free(lctx);
} else { // assume llama2.c vocabulary // llama_free_model(lmodel);
// } else
{ // assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename); printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
llama_file file(filename, "rb"); llama_file file(filename, "rb");
const int n_vocab = config->vocab_size; const int n_vocab = config->vocab_size;
@ -538,6 +568,12 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
float_t score = file.read_f32(); float_t score = file.read_f32();
uint32_t len = file.read_u32(); uint32_t len = file.read_u32();
std::string text = file.read_string(len); std::string text = file.read_string(len);
// Special-case handling of <0xXX> single byte tokens.
char byte_val;
if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
char cstr[2] = { byte_val, 0 };
text = cstr;
}
vocab->id_to_token[i].text = text; vocab->id_to_token[i].text = text;
vocab->id_to_token[i].score = score; vocab->id_to_token[i].score = score;
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED; vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
@ -589,83 +625,80 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
} }
#pragma message("TODO: implement file saving using gguf") #pragma message("TODO: implement file saving using gguf")
(void) vocab; // write_magic
(void) model; file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic
(void) w; file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
// // write_magic // write_hparams
// file.write_u32(LLAMA_FILE_MAGIC); // magic file.write_u32(model->hparams.n_vocab);
// file.write_u32(LLAMA_FILE_VERSION); // version file.write_u32(model->hparams.n_embd);
// // write_hparams file.write_u32(model->hparams.n_mult);
// file.write_u32(model->hparams.n_vocab); file.write_u32(model->hparams.n_head);
// file.write_u32(model->hparams.n_embd); file.write_u32(model->hparams.n_layer);
// file.write_u32(model->hparams.n_mult); file.write_u32(model->hparams.n_rot);
// file.write_u32(model->hparams.n_head); file.write_u32(LLAMA_FTYPE_ALL_F32);
// file.write_u32(model->hparams.n_layer);
// file.write_u32(model->hparams.n_rot); // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
// file.write_u32(LLAMA_FTYPE_ALL_F32); uint32_t n_vocab = model->hparams.n_vocab;
// for (uint32_t i = 0; i < n_vocab; i++) {
// // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk. const auto & token_data = vocab->id_to_token.at(i);
// uint32_t n_vocab = model->hparams.n_vocab; file.write_u32((uint32_t) token_data.text.size());
// for (uint32_t i = 0; i < n_vocab; i++) { file.write_raw(token_data.text.data(), token_data.text.size());
// const auto & token_data = vocab->id_to_token.at(i); file.write_raw(&token_data.score, sizeof(token_data.score));
// file.write_u32((uint32_t) token_data.tok.size()); }
// file.write_raw(token_data.tok.data(), token_data.tok.size());
// file.write_raw(&token_data.score, sizeof(token_data.score)); // stuff AK weights into GG weights one by one.
// } // w->token_embedding_table -> model->tok_embeddings
// // float* -> struct ggml_tensor
// // stuff AK weights into GG weights one by one. stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
// // w->token_embedding_table -> model->tok_embeddings stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
// // float* -> struct ggml_tensor
// stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table); stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
// stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table); //print_row(model->norm, 0);
//
// stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight); // for rms-att-weight
// //print_row(model->norm, 0); int row_length = model->hparams.n_embd;
// const auto & hparams = model->hparams;
// // for rms-att-weight //int n_ff = model->hparams.n_embd;
// int row_length = model->hparams.n_embd; int n_ff = get_n_ff(&hparams);
// const auto & hparams = model->hparams;
// //int n_ff = model->hparams.n_embd; for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
// int n_ff = get_n_ff(&hparams); auto & layer = model->layers[i];
// // 1d
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
// auto & layer = model->layers[i]; stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
// // 1d
// stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]); // from 3d matrix layer x dim x dim to 2d matrix dim x dim
// stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]); stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
// stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
// // from 3d matrix layer x dim x dim to 2d matrix dim x dim stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
// stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]); stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
// stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
// stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]); stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
// stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]); stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
// stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
// stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]); }
// stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]); // write tensors
// stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]); write_tensor(&file, model->tok_embeddings);
// } write_tensor(&file, model->norm);
// // write tensors write_tensor(&file, model->output); // ?
// write_tensor(&file, model->tok_embeddings); for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
// write_tensor(&file, model->norm); auto & layer = model->layers[i];
// write_tensor(&file, model->output); // ?
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) { write_tensor(&file, layer.attention_norm);
// auto & layer = model->layers[i]; write_tensor(&file, layer.wq);
// write_tensor(&file, layer.wk);
// write_tensor(&file, layer.attention_norm); write_tensor(&file, layer.wv);
// write_tensor(&file, layer.wq); write_tensor(&file, layer.wo);
// write_tensor(&file, layer.wk); write_tensor(&file, layer.ffn_norm);
// write_tensor(&file, layer.wv); write_tensor(&file, layer.w1);
// write_tensor(&file, layer.wo); write_tensor(&file, layer.w2);
// write_tensor(&file, layer.ffn_norm); write_tensor(&file, layer.w3);
// write_tensor(&file, layer.w1); }
// write_tensor(&file, layer.w2);
// write_tensor(&file, layer.w3);
// }
} }
struct train_params get_default_train_params() { struct train_params get_default_train_params() {
struct train_params params; struct train_params params;
params.fn_vocab_model = "models/ggml-vocab.bin"; params.fn_vocab_model = "tokenizer.bin";
params.fn_llama2c_output_model = "ak_llama_model.bin"; params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt"; params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin"; params.fn_checkpoint_in = "checkpoint.bin";
@ -718,7 +751,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "options:\n"); fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model); fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n"); fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model); fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
@ -791,9 +824,12 @@ int main(int argc, char ** argv) {
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
// read in the config header // read in the config header
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; } if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);
// read in the Transformer weights // read in the Transformer weights
malloc_weights(&weights, &config); malloc_weights(&weights, &config, shared_weights);
if(checkpoint_init_weights(&weights, &config, file)) { return 1; } if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
fclose(file); fclose(file);
} }

1
examples/embd-input/embd_input.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import ctypes import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np import numpy as np

1
examples/embd-input/llava.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys import sys
import os import os
sys.path.insert(0, os.path.dirname(__file__)) sys.path.insert(0, os.path.dirname(__file__))

1
examples/embd-input/minigpt4.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys import sys
import os import os
sys.path.insert(0, os.path.dirname(__file__)) sys.path.insert(0, os.path.dirname(__file__))

1
examples/embd-input/panda_gpt.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys import sys
import os import os
sys.path.insert(0, os.path.dirname(__file__)) sys.path.insert(0, os.path.dirname(__file__))

1
examples/jeopardy/graph.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import os import os
import csv import csv

0
examples/jeopardy/jeopardy.sh Normal file → Executable file
View file

1
examples/json-schema-to-grammar.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import argparse import argparse
import json import json
import re import re

View file

@ -43,7 +43,7 @@ static bool is_interacting = false;
void sigint_handler(int signo) { void sigint_handler(int signo) {
if (signo == SIGINT) { if (signo == SIGINT) {
if (!is_interacting) { if (!is_interacting) {
is_interacting=true; is_interacting = true;
} else { } else {
console::cleanup(); console::cleanup();
printf("\n"); printf("\n");
@ -189,23 +189,30 @@ int main(int argc, char ** argv) {
} }
} }
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
// tokenize the prompt // tokenize the prompt
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
embd_inp = ::llama_tokenize(ctx, params.prompt, true); embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
} else { } else {
embd_inp = session_tokens; embd_inp = session_tokens;
} }
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(ctx));
}
// Tokenize negative prompt // Tokenize negative prompt
std::vector<llama_token> guidance_inp; std::vector<llama_token> guidance_inp;
int guidance_offset = 0; int guidance_offset = 0;
int original_prompt_len = 0; int original_prompt_len = 0;
if (ctx_guidance) { if (ctx_guidance) {
params.cfg_negative_prompt.insert(0, 1, ' '); params.cfg_negative_prompt.insert(0, 1, ' ');
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, true); guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true); std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
original_prompt_len = original_inp.size(); original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len; guidance_offset = (int)guidance_inp.size() - original_prompt_len;
} }
@ -252,7 +259,7 @@ int main(int argc, char ** argv) {
} }
// prefix & suffix for instruct mode // prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
// in instruct mode, we inject a prefix and a suffix to each input by the user // in instruct mode, we inject a prefix and a suffix to each input by the user

1
examples/make-ggml.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
""" """
This script converts Hugging Face llama models to GGML and quantizes them. This script converts Hugging Face llama models to GGML and quantizes them.

View file

@ -27,12 +27,136 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs; return probs;
} }
void perplexity(llama_context * ctx, const gpt_params & params) { void perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval // BOS tokens will be added for each chunk before eval
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
if (params.ppl_stride <= 0) {
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
return;
}
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = is_spm;
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
const int calc_chunk = params.n_ctx;
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), params.n_ctx, params.ppl_stride);
return;
}
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(ctx);
const int n_batch = params.n_batch;
int count = 0;
double nll = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride;
const int end = start + calc_chunk;
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
//fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(ctx);
}
const auto batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
if (j == 0) {
tokens[batch_start] = token_org;
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
}
fflush(stdout);
}
printf("\n");
}
void perplexity(llama_context * ctx, const gpt_params & params) {
if (params.ppl_stride > 0) {
perplexity_v2(ctx, params);
return;
}
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
const bool add_bos = is_spm;
fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
auto tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
const int n_chunk_max = tokens.size() / params.n_ctx; const int n_chunk_max = tokens.size() / params.n_ctx;
@ -63,7 +187,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
const auto token_org = tokens[batch_start]; const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk // add BOS token for the first batch of each chunk
if (j == 0) { if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(ctx); tokens[batch_start] = llama_token_bos(ctx);
} }
@ -116,7 +240,11 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
++count; ++count;
} }
// perplexity is e^(average negative log-likelihood) // perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count));
}
fflush(stdout); fflush(stdout);
} }
printf("\n"); printf("\n");
@ -177,8 +305,10 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
size_t hs_task_count = prompt_lines.size()/6; size_t hs_task_count = prompt_lines.size()/6;
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
// This is needed as usual for LLaMA models // This is needed as usual for LLaMA models
bool prepend_bos = true; const bool add_bos = is_spm;
// Number of tasks to use when computing the score // Number of tasks to use when computing the score
if ( params.hellaswag_tasks < hs_task_count ) { if ( params.hellaswag_tasks < hs_task_count ) {
@ -234,14 +364,13 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
std::vector<float> tok_logits(n_vocab); std::vector<float> tok_logits(n_vocab);
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
// Tokenize the context to count tokens // Tokenize the context to count tokens
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, prepend_bos); std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
size_t context_size = context_embd.size(); size_t context_size = context_embd.size();
// Do the 1st ending // Do the 1st ending
// In this case we include the context when evaluating // In this case we include the context when evaluating
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], prepend_bos); auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
auto query_size = query_embd.size(); auto query_size = query_embd.size();
//printf("First query: %d\n",(int)query_size); //printf("First query: %d\n",(int)query_size);
@ -369,6 +498,12 @@ int main(int argc, char ** argv) {
params.perplexity = true; params.perplexity = true;
params.n_batch = std::min(params.n_batch, params.n_ctx); params.n_batch = std::min(params.n_batch, params.n_ctx);
if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2;
}
if (params.n_ctx > 2048) { if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);" fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx); "expect poor results\n", __func__, params.n_ctx);

View file

@ -12,25 +12,25 @@ struct quant_option {
}; };
static const std::vector<struct quant_option> QUANT_OPTIONS = { static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", }, { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", }, { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", }, { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", }, { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
#ifdef GGML_USE_K_QUANTS #ifdef GGML_USE_K_QUANTS
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", }, { "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", }, { "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", }, { "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", }, { "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", }, { "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", }, { "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", }, { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
#endif #endif
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
}; };

View file

@ -1,4 +1,3 @@
#!/bin/bash #!/bin/bash
cd `dirname $0` cd `dirname $0`

0
examples/server-llama2-13B.sh Normal file → Executable file
View file

View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import argparse import argparse
from flask import Flask, jsonify, request, Response from flask import Flask, jsonify, request, Response
import urllib.parse import urllib.parse

0
examples/server/chat-llama2.sh Normal file → Executable file
View file

0
examples/server/chat.sh Normal file → Executable file
View file

View file

@ -238,7 +238,7 @@ static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_t
alloc->n_free_blocks++; alloc->n_free_blocks++;
} }
void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) { void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n) {
int pos = 0; int pos = 0;
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
if (list[i] != -1) { if (list[i] != -1) {
@ -547,7 +547,7 @@ static size_t ggml_allocator_alloc_graph_tensors_n(
struct ggml_tensor * view_src = get_view_source(parent); struct ggml_tensor * view_src = get_view_source(parent);
struct hash_node * view_src_hn = hash_get(ht, view_src); struct hash_node * view_src_hn = hash_get(ht, view_src);
view_src_hn->n_views -= 1; view_src_hn->n_views -= 1;
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views); AT_PRINTF("view_src %s\n", view_src->name);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) { if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
ggml_allocator_free_tensor(alloc, view_src); ggml_allocator_free_tensor(alloc, view_src);
} }

View file

@ -12,7 +12,7 @@ GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
// tell the allocator to parse nodes following the order described in the list // tell the allocator to parse nodes following the order described in the list
// you should call this if your graph is optimized to execute out-of-order // you should call this if your graph is optimized to execute out-of-order
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n); GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);

View file

@ -3907,6 +3907,29 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
dst[i + 1] = x0*sin_theta + x1*cos_theta; dst[i + 1] = x0*sin_theta + x1*cos_theta;
} }
// TODO: this implementation is wrong!
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
// const float p_delta, const int p_delta_rows, const float theta_scale) {
// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
//
// if (col >= ncols) {
// return;
// }
//
// const int row = blockDim.x*blockIdx.x + threadIdx.x;
// const int i = row*ncols + col/2;
//
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
// const float sin_theta = sinf(theta);
// const float cos_theta = cosf(theta);
//
// const float x0 = x[i + 0];
// const float x1 = x[i + ncols/2];
//
// dst[i + 0] = x0*cos_theta - x1*sin_theta;
// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
//}
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) { static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
const int col = blockDim.x*blockIdx.x + threadIdx.x; const int col = blockDim.x*blockIdx.x + threadIdx.x;
const int half_n_dims = ncols/4; const int half_n_dims = ncols/4;
@ -5511,6 +5534,7 @@ inline void ggml_cuda_op_rope(
const float theta_scale = powf(freq_base, -2.0f/n_dims); const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
// compute // compute
@ -5519,6 +5543,9 @@ inline void ggml_cuda_op_rope(
const float id_p = min(p, n_ctx - 2.f); const float id_p = min(p, n_ctx - 2.f);
const float block_p = max(p - (n_ctx - 2.f), 0.f); const float block_p = max(p - (n_ctx - 2.f), 0.f);
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
} else if (is_neox) {
GGML_ASSERT(false && "RoPE NeoX not implemented yet");
#pragma message("TODO: implement RoPE NeoX for CUDA")
} else { } else {
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);

View file

@ -167,7 +167,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
#define GGML_METAL_ADD_KERNEL(name) \ #define GGML_METAL_ADD_KERNEL(name) \
ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \ ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \ ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name error:&error]; \
fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); \ fprintf(stderr, "%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name, \
(int) ctx->pipeline_##name.maxTotalThreadsPerThreadgroup, \
(int) ctx->pipeline_##name.threadExecutionWidth); \
if (error) { \ if (error) { \
fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \ fprintf(stderr, "%s: load pipeline error: %s\n", __func__, [[error description] UTF8String]); \
return NULL; \ return NULL; \
@ -538,7 +540,7 @@ void ggml_metal_graph_compute(
id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc]; id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
const int node_start = (cb_idx + 0) * n_nodes_per_cb; const int node_start = (cb_idx + 0) * n_nodes_per_cb;
const int node_end = (cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb; const int node_end = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
for (int ind = node_start; ind < node_end; ++ind) { for (int ind = node_start; ind < node_end; ++ind) {
const int i = has_concur ? ctx->concur_list[ind] : ind; const int i = has_concur ? ctx->concur_list[ind] : ind;
@ -768,8 +770,7 @@ void ggml_metal_graph_compute(
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10]; [encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
[encoder setThreadgroupMemoryLength:8192 atIndex:0]; [encoder setThreadgroupMemoryLength:8192 atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} } else {
else {
int nth0 = 32; int nth0 = 32;
int nth1 = 1; int nth1 = 1;
@ -872,20 +873,20 @@ void ggml_metal_graph_compute(
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) { src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q3_K) { else if (src0t == GGML_TYPE_Q3_K) {
#ifdef GGML_QKK_64 #ifdef GGML_QKK_64
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#else #else
[encoder dispatchThreadgroups:MTLSizeMake((ne01+3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
#endif #endif
} }
else if (src0t == GGML_TYPE_Q5_K) { else if (src0t == GGML_TYPE_Q5_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} }
else if (src0t == GGML_TYPE_Q6_K) { else if (src0t == GGML_TYPE_Q6_K) {
[encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
} else { } else {
[encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:nth0*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
@ -938,7 +939,8 @@ void ggml_metal_graph_compute(
} break; } break;
case GGML_OP_NORM: case GGML_OP_NORM:
{ {
const float eps = 1e-5f; float eps;
memcpy(&eps, dst->op_params, sizeof(float));
const int nth = 256; const int nth = 256;
@ -990,7 +992,9 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&m0 length:sizeof( float) atIndex:18]; [encoder setBytes:&m0 length:sizeof( float) atIndex:18];
const int nth = 32; const int nth = 32;
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_ROPE: case GGML_OP_ROPE:

View file

@ -87,7 +87,12 @@ kernel void kernel_gelu(
device float * dst, device float * dst,
uint tpig[[thread_position_in_grid]]) { uint tpig[[thread_position_in_grid]]) {
float x = src0[tpig]; float x = src0[tpig];
dst[tpig] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
// BEWARE !!!
// Simply using "tanh" instead of "precise::tanh" will sometimes result in NaNs!
// This was observed with Falcon 7B and 40B models
//
dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
} }
kernel void kernel_soft_max( kernel void kernel_soft_max(
@ -571,7 +576,25 @@ kernel void kernel_rope(
dst_data[1] = x0*sin_theta + x1*cos_theta; dst_data[1] = x0*sin_theta + x1*cos_theta;
} }
} else { } else {
// TODO: implement for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) {
const float cos_theta = cos(theta);
const float sin_theta = sin(theta);
theta *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
device const float * const src = (device float *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
device float * dst_data = (device float *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float x0 = src[0];
const float x1 = src[n_dims/2];
dst_data[0] = x0*cos_theta - x1*sin_theta;
dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
}
}
} }
} }

24
ggml.c
View file

@ -5556,10 +5556,6 @@ struct ggml_tensor * ggml_repeat(
is_node = true; is_node = true;
} }
if (ggml_are_same_shape(a, b) && !is_node) {
return a;
}
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne); struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
result->op = GGML_OP_REPEAT; result->op = GGML_OP_REPEAT;
@ -5790,6 +5786,7 @@ struct ggml_tensor * ggml_silu_back(
static struct ggml_tensor * ggml_norm_impl( static struct ggml_tensor * ggml_norm_impl(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
float eps,
bool inplace) { bool inplace) {
bool is_node = false; bool is_node = false;
@ -5800,7 +5797,7 @@ static struct ggml_tensor * ggml_norm_impl(
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
// TODO: maybe store epsilon here? ggml_set_op_params(result, &eps, sizeof(eps));
result->op = GGML_OP_NORM; result->op = GGML_OP_NORM;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@ -5811,14 +5808,16 @@ static struct ggml_tensor * ggml_norm_impl(
struct ggml_tensor * ggml_norm( struct ggml_tensor * ggml_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a) { struct ggml_tensor * a,
return ggml_norm_impl(ctx, a, false); float eps) {
return ggml_norm_impl(ctx, a, eps, false);
} }
struct ggml_tensor * ggml_norm_inplace( struct ggml_tensor * ggml_norm_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a) { struct ggml_tensor * a,
return ggml_norm_impl(ctx, a, true); float eps) {
return ggml_norm_impl(ctx, a, eps, true);
} }
// ggml_rms_norm // ggml_rms_norm
@ -10620,7 +10619,8 @@ static void ggml_compute_forward_norm_f32(
GGML_TENSOR_UNARY_OP_LOCALS; GGML_TENSOR_UNARY_OP_LOCALS;
const float eps = 1e-5f; // TODO: make this a parameter float eps;
memcpy(&eps, dst->op_params, sizeof(float));
// TODO: optimize // TODO: optimize
for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -12533,7 +12533,7 @@ static void ggml_compute_forward_rope_f32(
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta; dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
} }
} else { } else {
// TODO: this is probably wrong, but I can't figure it out .. // TODO: this might be wrong for ne0 != n_dims - need double check
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) { for (int64_t ic = 0; ic < n_dims; ic += 2) {
@ -12662,7 +12662,7 @@ static void ggml_compute_forward_rope_f16(
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta); dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
} }
} else { } else {
// TODO: this is probably wrong, but I can't figure it out .. // TODO: this might be wrong for ne0 != n_dims - need double check
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28 // ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) { for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) { for (int64_t ic = 0; ic < n_dims; ic += 2) {

7
ggml.h
View file

@ -909,14 +909,15 @@ extern "C" {
struct ggml_tensor * b); struct ggml_tensor * b);
// normalize along rows // normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
GGML_API struct ggml_tensor * ggml_norm( GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a,
float eps);
GGML_API struct ggml_tensor * ggml_norm_inplace( GGML_API struct ggml_tensor * ggml_norm_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a,
float eps);
GGML_API struct ggml_tensor * ggml_rms_norm( GGML_API struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx, struct ggml_context * ctx,

27
gguf.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import shutil import shutil
import sys import sys
import struct import struct
@ -29,12 +30,12 @@ KEY_GENERAL_SOURCE_HF_REPO = "general.source.hugginface.repository"
KEY_GENERAL_FILE_TYPE = "general.file_type" KEY_GENERAL_FILE_TYPE = "general.file_type"
# LLM # LLM
KEY_LLM_CONTEXT_LENGTH = "{arch}.context_length" KEY_CONTEXT_LENGTH = "{arch}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{arch}.embedding_length" KEY_EMBEDDING_LENGTH = "{arch}.embedding_length"
KEY_LLM_BLOCK_COUNT = "{arch}.block_count" KEY_BLOCK_COUNT = "{arch}.block_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" KEY_FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual" KEY_USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout" KEY_TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
# attention # attention
KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count" KEY_ATTENTION_HEAD_COUNT = "{arch}.attention.head_count"
@ -582,7 +583,7 @@ class GGUFWriter:
self.add_string(KEY_GENERAL_AUTHOR, author) self.add_string(KEY_GENERAL_AUTHOR, author)
def add_tensor_data_layout(self, layout: str): def add_tensor_data_layout(self, layout: str):
self.add_string(KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) self.add_string(KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
def add_url(self, url: str): def add_url(self, url: str):
self.add_string(KEY_GENERAL_URL, url) self.add_string(KEY_GENERAL_URL, url)
@ -612,27 +613,27 @@ class GGUFWriter:
def add_context_length(self, length: int): def add_context_length(self, length: int):
self.add_uint32( self.add_uint32(
KEY_LLM_CONTEXT_LENGTH.format(arch=self.arch), length) KEY_CONTEXT_LENGTH.format(arch=self.arch), length)
def add_embedding_length(self, length: int): def add_embedding_length(self, length: int):
self.add_uint32( self.add_uint32(
KEY_LLM_EMBEDDING_LENGTH.format(arch=self.arch), length) KEY_EMBEDDING_LENGTH.format(arch=self.arch), length)
def add_block_count(self, length: int): def add_block_count(self, length: int):
self.add_uint32( self.add_uint32(
KEY_LLM_BLOCK_COUNT.format(arch=self.arch), length) KEY_BLOCK_COUNT.format(arch=self.arch), length)
def add_feed_forward_length(self, length: int): def add_feed_forward_length(self, length: int):
self.add_uint32( self.add_uint32(
KEY_LLM_FEED_FORWARD_LENGTH.format(arch=self.arch), length) KEY_FEED_FORWARD_LENGTH.format(arch=self.arch), length)
def add_parallel_residual(self, use: bool): def add_parallel_residual(self, use: bool):
self.add_bool( self.add_bool(
KEY_LLM_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use) KEY_USE_PARALLEL_RESIDUAL.format(arch=self.arch), use)
def add_tensor_data_layout(self, layout: str): def add_tensor_data_layout(self, layout: str):
self.add_string( self.add_string(
KEY_LLM_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) KEY_TENSOR_DATA_LAYOUT.format(arch=self.arch), layout)
def add_head_count(self, count: int): def add_head_count(self, count: int):
self.add_uint32( self.add_uint32(

1572
llama.cpp

File diff suppressed because it is too large Load diff

15
llama.h
View file

@ -247,6 +247,8 @@ extern "C" {
LLAMA_API int llama_n_ctx (const struct llama_context * ctx); LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx); LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx);
LLAMA_API int llama_model_n_vocab(const struct llama_model * model); LLAMA_API int llama_model_n_vocab(const struct llama_model * model);
LLAMA_API int llama_model_n_ctx (const struct llama_model * model); LLAMA_API int llama_model_n_ctx (const struct llama_model * model);
LLAMA_API int llama_model_n_embd (const struct llama_model * model); LLAMA_API int llama_model_n_embd (const struct llama_model * model);
@ -368,13 +370,6 @@ extern "C" {
int n_max_tokens, int n_max_tokens,
bool add_bos); bool add_bos);
LLAMA_API int llama_tokenize_bpe(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_tokenize_with_model( LLAMA_API int llama_tokenize_with_model(
const struct llama_model * model, const struct llama_model * model,
const char * text, const char * text,
@ -390,12 +385,6 @@ extern "C" {
char * buf, char * buf,
int length); int length);
LLAMA_API int llama_token_to_str_bpe(
const struct llama_context * ctx,
llama_token token,
char * buf,
int length);
LLAMA_API int llama_token_to_str_with_model( LLAMA_API int llama_token_to_str_with_model(
const struct llama_model * model, const struct llama_model * model,
llama_token token, llama_token token,

View file

@ -473,7 +473,7 @@ bool gpt2_eval(
// norm // norm
{ {
// [ 768, N] // [ 768, N]
cur = ggml_norm(ctx0, inpL); cur = ggml_norm(ctx0, inpL, default_norm_eps);
// cur = ln_1_g*cur + ln_1_b // cur = ln_1_g*cur + ln_1_b
// [ 768, N] // [ 768, N]
@ -624,7 +624,7 @@ bool gpt2_eval(
{ {
// norm // norm
{ {
cur = ggml_norm(ctx0, inpFF); cur = ggml_norm(ctx0, inpFF, default_norm_eps);
// cur = ln_2_g*cur + ln_2_b // cur = ln_2_g*cur + ln_2_b
// [ 768, N] // [ 768, N]
@ -683,7 +683,7 @@ bool gpt2_eval(
// norm // norm
{ {
// [ 768, N] // [ 768, N]
inpL = ggml_norm(ctx0, inpL); inpL = ggml_norm(ctx0, inpL, default_norm_eps);
// inpL = ln_f_g*inpL + ln_f_b // inpL = ln_f_g*inpL + ln_f_b
// [ 768, N] // [ 768, N]

View file

@ -464,7 +464,7 @@ bool gptj_eval(
// norm // norm
{ {
cur = ggml_norm(ctx0, inpL); cur = ggml_norm(ctx0, inpL, default_norm_eps);
// cur = ln_1_g*cur + ln_1_b // cur = ln_1_g*cur + ln_1_b
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -594,7 +594,7 @@ bool gptj_eval(
// norm // norm
{ {
inpL = ggml_norm(ctx0, inpL); inpL = ggml_norm(ctx0, inpL, default_norm_eps);
// inpL = ln_f_g*inpL + ln_f_b // inpL = ln_f_g*inpL + ln_f_b
inpL = ggml_add(ctx0, inpL = ggml_add(ctx0,

View file

@ -399,7 +399,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
// a = self.ln_1(x) // a = self.ln_1(x)
{ {
cur = ggml_norm(ctx0, inpL); cur = ggml_norm(ctx0, inpL, default_norm_eps);
cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur); cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_1_weight, cur), cur);
} }
@ -497,7 +497,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
// m = self.ln_2(x) // m = self.ln_2(x)
{ {
cur = ggml_norm(ctx0, inpL); cur = ggml_norm(ctx0, inpL, default_norm_eps);
cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur); cur = ggml_mul(ctx0, ggml_repeat(ctx0, model.layers[il].norm_2_weight, cur), cur);
} }
@ -525,7 +525,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
// norm // norm
{ {
inpL = ggml_norm(ctx0, inpL); inpL = ggml_norm(ctx0, inpL, default_norm_eps);
// inpL = ln_f_g*inpL // inpL = ln_f_g*inpL
inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL); inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
} }

View file

@ -367,7 +367,7 @@ ggml_tensor * gpt_neox_ff(
const gpt_neox_layer &layer, const gpt_neox_layer &layer,
ggml_context * ctx0, ggml_context * ctx0,
ggml_tensor * inp) { ggml_tensor * inp) {
ggml_tensor * cur = ggml_norm(ctx0, inp); ggml_tensor * cur = ggml_norm(ctx0, inp, default_norm_eps);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
ggml_mul(ctx0, ggml_mul(ctx0,
@ -481,7 +481,7 @@ bool gpt_neox_eval(
// self-attention // self-attention
{ {
{ {
cur = ggml_norm(ctx0, inpL); cur = ggml_norm(ctx0, inpL, default_norm_eps);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
ggml_mul(ctx0, ggml_mul(ctx0,
@ -613,7 +613,7 @@ bool gpt_neox_eval(
// norm // norm
{ {
inpL = ggml_norm(ctx0, inpL); inpL = ggml_norm(ctx0, inpL, default_norm_eps);
// inpL = ln_f_g*inpL + ln_f_b // inpL = ln_f_g*inpL + ln_f_b
inpL = ggml_add(ctx0, inpL = ggml_add(ctx0,

View file

@ -458,4 +458,4 @@ struct mpt_model {
std::map<std::string, struct ggml_tensor *> tensors; std::map<std::string, struct ggml_tensor *> tensors;
}; };
const float default_norm_eps = 1e-5f;

View file

@ -477,7 +477,7 @@ struct ggml_tensor * rwkv_max(ggml_context * ctx, struct ggml_tensor * x, struct
struct ggml_tensor * rwkv_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) { struct ggml_tensor * rwkv_layer_norm(ggml_context * ctx, struct ggml_tensor * x, struct ggml_tensor * weight, struct ggml_tensor * bias) {
// LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias` // LayerNorm in RWKV is `x = (x - mean(x)) / sqrt(variance(x) + 1e-5) * weight + bias`
// Looks like ggml_norm does the first part, we only need to apply weight & bias. // Looks like ggml_norm does the first part, we only need to apply weight & bias.
return ggml_add_inplace(ctx, ggml_mul_inplace(ctx, ggml_norm(ctx, x), weight), bias); return ggml_add_inplace(ctx, ggml_mul_inplace(ctx, ggml_norm(ctx, x, default_norm_eps), weight), bias);
} }
// --- Implementation --- // --- Implementation ---

Binary file not shown.

Binary file not shown.

0
scripts/get-wikitext-2.sh Normal file → Executable file
View file

View file

@ -67,11 +67,13 @@ int main(int argc, char **argv) {
} }
} }
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
const int n_vocab = llama_n_vocab(ctx); const int n_vocab = llama_n_vocab(ctx);
for (int i = 0; i < n_vocab; ++i) { for (int i = 0; i < n_vocab; ++i) {
std::string forward = llama_token_to_str_bpe(ctx, i); std::string forward = llama_token_to_str(ctx, i);
std::vector<llama_token> tokens = llama_tokenize_bpe(ctx, forward, false); std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
if (tokens.size() == 1) { if (tokens.size() == 1) {
if (i != tokens[0]) { if (i != tokens[0]) {
std::string backward = llama_token_to_str(ctx, tokens[0]); std::string backward = llama_token_to_str(ctx, tokens[0]);
@ -79,16 +81,6 @@ int main(int argc, char **argv) {
__func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str()); __func__, i, llama_token_to_str(ctx, i).c_str(), tokens[0], backward.c_str());
return 2; return 2;
} }
} else {
llama_token_type type = llama_token_get_type(ctx, i);
if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
} else {
fprintf(stderr, "%s : error: token %d is string %s but bpe returns tokens %s\n",
__func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
return 2;
}
} }
} }