Merge branch 'master' into master
This commit is contained in:
commit
16f8bba496
31 changed files with 2345 additions and 1986 deletions
33
.github/workflows/build.yml
vendored
33
.github/workflows/build.yml
vendored
|
@ -32,6 +32,8 @@ jobs:
|
|||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
|
@ -88,6 +90,8 @@ jobs:
|
|||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
|
@ -206,6 +210,8 @@ jobs:
|
|||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
|
@ -238,6 +244,33 @@ jobs:
|
|||
./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf
|
||||
./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER="$(git rev-list --count HEAD)"
|
||||
SHORT_HASH="$(git rev-parse --short=7 HEAD)"
|
||||
if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
|
||||
else
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
run: |
|
||||
cp LICENSE ./build/bin/
|
||||
zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
|
||||
name: llama-bin-ubuntu-x64.zip
|
||||
|
||||
# ubuntu-latest-cmake-sanitizer:
|
||||
# runs-on: ubuntu-latest
|
||||
#
|
||||
|
|
|
@ -230,11 +230,10 @@ source /opt/intel/oneapi/setvars.sh
|
|||
mkdir -p build && cd build
|
||||
|
||||
# Option 1: Use FP16 for better performance in long-prompt inference
|
||||
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
# Or without "--build", run "make" next
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
|
||||
# Option 2: Use FP32 by default
|
||||
cmake --build .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -j -v
|
||||
|
@ -252,10 +251,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
|||
mkdir -p build && cd build
|
||||
|
||||
# Option 1: Use FP16 for better performance in long-prompt inference
|
||||
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
||||
|
||||
# Option 2: Use FP32 by default
|
||||
cmake --build .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -j -v
|
||||
|
|
|
@ -122,6 +122,7 @@ Typically finetunes of the base models below are supported as well.
|
|||
- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
|
||||
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
|
||||
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
|
||||
- [x] [OLMo](https://allenai.org/olmo)
|
||||
|
||||
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
|
||||
|
||||
|
|
|
@ -108,7 +108,7 @@ int32_t get_num_physical_cores() {
|
|||
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
|
||||
}
|
||||
|
||||
#if defined(__x86_64__) && defined(__linux__)
|
||||
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
|
||||
#include <pthread.h>
|
||||
|
||||
static void cpuid(unsigned leaf, unsigned subleaf,
|
||||
|
@ -162,7 +162,7 @@ static int count_math_cpus(int cpu_count) {
|
|||
* Returns number of CPUs on system that are useful for math.
|
||||
*/
|
||||
int get_math_cpu_count() {
|
||||
#if defined(__x86_64__) && defined(__linux__)
|
||||
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
|
||||
int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
|
||||
if (cpu_count < 1) {
|
||||
return get_num_physical_cores();
|
||||
|
|
|
@ -1301,10 +1301,18 @@ class LlamaModel(Model):
|
|||
try:
|
||||
self. _set_vocab_sentencepiece()
|
||||
except FileNotFoundError:
|
||||
try:
|
||||
self._set_vocab_llama_hf()
|
||||
except (FileNotFoundError, TypeError):
|
||||
# Llama 3
|
||||
self._set_vocab_gpt2()
|
||||
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
|
||||
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
|
||||
# Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256)
|
||||
if self.hparams.get("vocab_size", 32000) == 32016:
|
||||
special_vocab = gguf.SpecialVocab(
|
||||
self.dir_model, load_merges=False,
|
||||
special_token_types = ['prefix', 'suffix', 'middle', 'eot']
|
||||
)
|
||||
special_vocab._set_special_token("prefix", 32007)
|
||||
special_vocab._set_special_token("suffix", 32008)
|
||||
special_vocab._set_special_token("middle", 32009)
|
||||
|
@ -2194,6 +2202,8 @@ class InternLM2Model(Model):
|
|||
old_eos = special_vocab.special_token_ids["eos"]
|
||||
if "chat" in os.path.basename(self.dir_model.absolute()):
|
||||
# For the chat model, we replace the eos with '<|im_end|>'.
|
||||
# TODO: this is a hack, should be fixed
|
||||
# https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
|
||||
special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer)
|
||||
print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \
|
||||
in chat mode so that the conversation can end normally.")
|
||||
|
@ -2429,12 +2439,15 @@ class GemmaModel(Model):
|
|||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
# TODO: these special tokens should be exported only for the CodeGemma family
|
||||
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False,
|
||||
special_token_types = ['prefix', 'suffix', 'middle', 'eot'])
|
||||
special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot'])
|
||||
special_vocab._set_special_token("prefix", 67)
|
||||
special_vocab._set_special_token("suffix", 69)
|
||||
special_vocab._set_special_token("middle", 68)
|
||||
special_vocab._set_special_token("eot", 70)
|
||||
special_vocab._set_special_token("fsep", 70)
|
||||
special_vocab._set_special_token("eot", 107)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
|
@ -2523,16 +2536,22 @@ class MambaModel(Model):
|
|||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL)
|
||||
self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]))
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST)
|
||||
self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size])
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE)
|
||||
self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size])
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES)
|
||||
self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data])
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID)
|
||||
self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0])
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID)
|
||||
self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0])
|
||||
|
||||
field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID)
|
||||
self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0])
|
||||
|
||||
|
@ -2636,6 +2655,66 @@ class CommandR2Model(Model):
|
|||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
|
||||
|
||||
|
||||
@Model.register("OlmoForCausalLM")
|
||||
@Model.register("OLMoForCausalLM")
|
||||
class OlmoModel(Model):
|
||||
model_arch = gguf.MODEL_ARCH.OLMO
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
super().set_gguf_parameters()
|
||||
self.gguf_writer.add_layer_norm_eps(1e-5)
|
||||
if "clip_qkv" in self.hparams is not None:
|
||||
self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
|
||||
|
||||
# Same as super class, but permuting q_proj, k_proj
|
||||
# Copied from: LlamaModel
|
||||
def write_tensors(self):
|
||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
n_head = self.hparams.get("num_attention_heads")
|
||||
n_kv_head = self.hparams.get("num_key_value_heads")
|
||||
for name, data_torch in self.get_tensors():
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.numpy()
|
||||
|
||||
if name.endswith("q_proj.weight"):
|
||||
data = permute(data, n_head, n_head)
|
||||
if name.endswith("k_proj.weight"):
|
||||
data = permute(data, n_head, n_kv_head)
|
||||
|
||||
data = data.squeeze()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# 1d tensors need to be converted to float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
|
|
@ -525,7 +525,14 @@ class LlamaHfVocab(Vocab):
|
|||
|
||||
# pre-check so we know if we need transformers
|
||||
tokenizer_model: dict[str, Any] = tokenizer_json['model']
|
||||
if (
|
||||
is_llama3 = (
|
||||
tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
|
||||
and not tokenizer_model.get('byte_fallback', True)
|
||||
)
|
||||
if is_llama3:
|
||||
raise TypeError('Llama 3 must be converted with BpeVocab')
|
||||
|
||||
if not is_llama3 and (
|
||||
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
|
||||
or tokenizer_json['decoder']['type'] != 'Sequence'
|
||||
):
|
||||
|
|
|
@ -153,7 +153,7 @@ while n_cur <= n_len {
|
|||
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream? -> mark the stream as finished
|
||||
if new_token_id == llama_token_eos(model) || n_cur == n_len {
|
||||
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
||||
i_batch[i] = -1
|
||||
// print("")
|
||||
if n_parallel > 1 {
|
||||
|
|
|
@ -191,8 +191,8 @@ int main(int argc, char ** argv) {
|
|||
|
||||
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream? -> mark the stream as finished
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
// is it an end of generation? -> mark the stream as finished
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
i_batch[i] = -1;
|
||||
LOG_TEE("\n");
|
||||
if (n_parallel > 1) {
|
||||
|
|
|
@ -47,7 +47,7 @@ struct beam_search_callback_data {
|
|||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
|
||||
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
|
||||
}
|
||||
|
||||
// Function matching type llama_beam_search_callback_fn_t.
|
||||
|
|
|
@ -651,8 +651,8 @@ int main(int argc, char ** argv) {
|
|||
// LOG_TEE("took new input\n");
|
||||
is_interacting = false;
|
||||
}
|
||||
// deal with end of text token in interactive mode
|
||||
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
|
||||
// deal with end of generation tokens in interactive mode
|
||||
else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
@ -731,8 +731,8 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) {
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
|
@ -408,7 +408,7 @@ Java_com_example_llama_Llm_completion_1loop(
|
|||
const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
|
||||
|
||||
const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
return env->NewStringUTF("");
|
||||
}
|
||||
|
||||
|
|
|
@ -158,7 +158,7 @@ actor LlamaContext {
|
|||
new_token_id = llama_sample_token_greedy(context, &candidates_p)
|
||||
}
|
||||
|
||||
if new_token_id == llama_token_eos(model) || n_cur == n_len {
|
||||
if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
|
||||
print("\n")
|
||||
let new_token_str = String(cString: temporary_invalid_cchars + [0])
|
||||
temporary_invalid_cchars.removeAll()
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
// I'll gradually clean and extend it
|
||||
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
|
||||
#include "clip.h"
|
||||
#include "log.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
@ -23,7 +24,6 @@
|
|||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
|
@ -145,7 +145,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
|||
static int get_key_idx(const gguf_context * ctx, const char * key) {
|
||||
int i = gguf_find_key(ctx, key);
|
||||
if (i == -1) {
|
||||
fprintf(stderr, "key %s not found in file\n", key);
|
||||
LOG_TEE("key %s not found in file\n", key);
|
||||
throw std::runtime_error(format("Missing required key: %s", key));
|
||||
}
|
||||
|
||||
|
@ -247,7 +247,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
|||
|
||||
static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") {
|
||||
size_t tensor_size = ggml_nbytes(tensor);
|
||||
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||
LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
||||
}
|
||||
|
@ -265,7 +265,7 @@ static projector_type clip_projector_type_from_string(const std::string & name)
|
|||
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
|
||||
std::ofstream file(filename, std::ios::binary);
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "Failed to open file for writing: " << filename << std::endl;
|
||||
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -284,7 +284,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s
|
|||
static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) {
|
||||
std::ofstream file(filename, std::ios::binary);
|
||||
if (!file.is_open()) {
|
||||
std::cerr << "Failed to open file for writing: " << filename << std::endl;
|
||||
LOG_TEE("Failed to open file for writing: %s\n", filename.c_str());
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -515,7 +515,7 @@ struct clip_ctx {
|
|||
|
||||
static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
@ -879,21 +879,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
const int idx_name = gguf_find_key(ctx, KEY_NAME);
|
||||
if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug
|
||||
const std::string name = gguf_get_val_str(ctx, idx_name);
|
||||
printf("%s: model name: %s\n", __func__, name.c_str());
|
||||
LOG_TEE("%s: model name: %s\n", __func__, name.c_str());
|
||||
}
|
||||
printf("%s: description: %s\n", __func__, description.c_str());
|
||||
printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
||||
printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
printf("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
printf("%s: n_kv: %d\n", __func__, n_kv);
|
||||
printf("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||
printf("\n");
|
||||
LOG_TEE("%s: description: %s\n", __func__, description.c_str());
|
||||
LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx));
|
||||
LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
|
||||
LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors);
|
||||
LOG_TEE("%s: n_kv: %d\n", __func__, n_kv);
|
||||
LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str());
|
||||
LOG_TEE("\n");
|
||||
}
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
// kv
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
||||
LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
||||
__func__, n_kv, n_tensors, fname);
|
||||
{
|
||||
std::map<enum ggml_type, uint32_t> n_type;
|
||||
|
@ -904,7 +904,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
n_type[type]++;
|
||||
}
|
||||
|
||||
printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||
LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * name = gguf_get_key(ctx, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||||
|
@ -920,7 +920,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
}
|
||||
replace_all(value, "\n", "\\n");
|
||||
|
||||
printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||
LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||
}
|
||||
|
||||
// print type counts
|
||||
|
@ -929,7 +929,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
continue;
|
||||
}
|
||||
|
||||
printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
||||
LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -944,7 +944,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
size_t tensor_size = ggml_nbytes(cur);
|
||||
model_size += tensor_size;
|
||||
if (verbosity >= 3) {
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||
LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
||||
}
|
||||
}
|
||||
|
@ -971,18 +971,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
|
||||
#ifdef GGML_USE_CUDA
|
||||
new_clip->backend = ggml_backend_cuda_init(0);
|
||||
printf("%s: CLIP using CUDA backend\n", __func__);
|
||||
LOG_TEE("%s: CLIP using CUDA backend\n", __func__);
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
new_clip->backend = ggml_backend_metal_init();
|
||||
printf("%s: CLIP using Metal backend\n", __func__);
|
||||
LOG_TEE("%s: CLIP using Metal backend\n", __func__);
|
||||
#endif
|
||||
|
||||
|
||||
if (!new_clip->backend) {
|
||||
new_clip->backend = ggml_backend_cpu_init();
|
||||
printf("%s: CLIP using CPU backend\n", __func__);
|
||||
LOG_TEE("%s: CLIP using CPU backend\n", __func__);
|
||||
}
|
||||
|
||||
// model size and capabilities
|
||||
|
@ -1006,15 +1006,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
new_clip->use_gelu = gguf_get_val_bool(ctx, idx);
|
||||
|
||||
if (verbosity >= 1) {
|
||||
printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||
printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
printf("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder);
|
||||
LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder);
|
||||
LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector);
|
||||
LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0);
|
||||
LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
||||
LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors);
|
||||
|
||||
// load tensors
|
||||
{
|
||||
|
@ -1027,7 +1027,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
|
||||
new_clip->ctx_data = ggml_init(params);
|
||||
if (!new_clip->ctx_data) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
LOG_TEE("%s: ggml_init() failed\n", __func__);
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
|
@ -1035,7 +1035,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
printf("cannot open model file for loading tensors\n");
|
||||
LOG_TEE("cannot open model file for loading tensors\n");
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
|
@ -1057,7 +1057,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i);
|
||||
fin.seekg(offset, std::ios::beg);
|
||||
if (!fin) {
|
||||
printf("%s: failed to seek for tensor %s\n", __func__, name);
|
||||
LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name);
|
||||
clip_free(new_clip);
|
||||
gguf_free(ctx);
|
||||
return nullptr;
|
||||
|
@ -1128,23 +1128,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
}
|
||||
|
||||
if (verbosity >= 2) {
|
||||
printf("\n%s: vision model hparams\n", __func__);
|
||||
printf("image_size %d\n", hparams.image_size);
|
||||
printf("patch_size %d\n", hparams.patch_size);
|
||||
printf("v_hidden_size %d\n", hparams.hidden_size);
|
||||
printf("v_n_intermediate %d\n", hparams.n_intermediate);
|
||||
printf("v_projection_dim %d\n", hparams.projection_dim);
|
||||
printf("v_n_head %d\n", hparams.n_head);
|
||||
printf("v_n_layer %d\n", hparams.n_layer);
|
||||
printf("v_eps %f\n", hparams.eps);
|
||||
printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||
printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||
printf("v_image_grid_pinpoints: ");
|
||||
LOG_TEE("\n%s: vision model hparams\n", __func__);
|
||||
LOG_TEE("image_size %d\n", hparams.image_size);
|
||||
LOG_TEE("patch_size %d\n", hparams.patch_size);
|
||||
LOG_TEE("v_hidden_size %d\n", hparams.hidden_size);
|
||||
LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate);
|
||||
LOG_TEE("v_projection_dim %d\n", hparams.projection_dim);
|
||||
LOG_TEE("v_n_head %d\n", hparams.n_head);
|
||||
LOG_TEE("v_n_layer %d\n", hparams.n_layer);
|
||||
LOG_TEE("v_eps %f\n", hparams.eps);
|
||||
LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
|
||||
LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
|
||||
LOG_TEE("v_image_grid_pinpoints: ");
|
||||
for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) {
|
||||
printf("%d ", hparams.image_grid_pinpoints[i]);
|
||||
LOG_TEE("%d ", hparams.image_grid_pinpoints[i]);
|
||||
}
|
||||
printf("\n");
|
||||
printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
|
||||
|
||||
}
|
||||
|
||||
|
@ -1155,7 +1155,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
} catch(const std::exception& e) {
|
||||
fprintf(stderr, "%s: failed to load vision model tensors\n", __func__);
|
||||
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
|
||||
}
|
||||
|
||||
// LLaVA projection
|
||||
|
@ -1184,7 +1184,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
} catch (std::runtime_error & e) { }
|
||||
try {
|
||||
vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE);
|
||||
// fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__);
|
||||
// LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__);
|
||||
} catch (std::runtime_error & e) { }
|
||||
} else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projection
|
||||
|
@ -1264,7 +1264,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch);
|
||||
ggml_gallocr_reserve(new_clip->compute_alloc, gf);
|
||||
size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0);
|
||||
printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0);
|
||||
}
|
||||
|
||||
return new_clip;
|
||||
|
@ -1304,7 +1304,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|||
int nx, ny, nc;
|
||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname);
|
||||
LOG_TEE("%s: failed to load image '%s'\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
build_clip_img_from_data(data, nx, ny, img);
|
||||
|
@ -1316,7 +1316,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
|
|||
int nx, ny, nc;
|
||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
fprintf(stderr, "%s: failed to decode image bytes\n", __func__);
|
||||
LOG_TEE("%s: failed to decode image bytes\n", __func__);
|
||||
return false;
|
||||
}
|
||||
build_clip_img_from_data(data, nx, ny, img);
|
||||
|
@ -1506,7 +1506,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int> & or
|
|||
int downscaled_height = static_cast<int>(original_height * scale);
|
||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||
int wasted_resolution = (width * height) - effective_resolution;
|
||||
// fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||
max_effective_resolution = effective_resolution;
|
||||
min_wasted_resolution = wasted_resolution;
|
||||
|
@ -1545,7 +1545,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
|
|||
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
|
||||
bool pad_to_square = true;
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
}
|
||||
auto & params = ctx->vision_model.hparams;
|
||||
|
@ -1622,7 +1622,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
|
|||
}
|
||||
|
||||
for (size_t i = 0; i < patches.size(); i++) {
|
||||
// printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
||||
// LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny);
|
||||
clip_image_u8_free(patches[i]);
|
||||
}
|
||||
|
||||
|
@ -1765,7 +1765,7 @@ int clip_n_patches(const struct clip_ctx * ctx) {
|
|||
|
||||
bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -1777,7 +1777,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
|
|||
|
||||
bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) {
|
||||
if (!ctx->has_vision_encoder) {
|
||||
printf("This gguf file seems to have no vision encoder\n");
|
||||
LOG_TEE("This gguf file seems to have no vision encoder\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -1939,7 +1939,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
new_type = type;
|
||||
if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
|
||||
new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
|
||||
// fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
||||
// LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
|
||||
}
|
||||
const size_t n_elms = ggml_nelements(cur);
|
||||
float * f32_data;
|
||||
|
@ -1958,7 +1958,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
f32_data = (float *)conv_buf.data();
|
||||
break;
|
||||
default:
|
||||
printf("Please use an input file in f32 or f16\n");
|
||||
LOG_TEE("Please use an input file in f32 or f16\n");
|
||||
gguf_free(ctx_out);
|
||||
return false;
|
||||
}
|
||||
|
@ -1985,7 +1985,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
fout.put(0);
|
||||
}
|
||||
|
||||
printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
||||
LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
|
||||
orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
|
@ -2001,8 +2001,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
gguf_free(ctx_out);
|
||||
|
||||
{
|
||||
printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||
printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||
LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
|
||||
LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
#include "ggml.h"
|
||||
#include "log.h"
|
||||
#include "common.h"
|
||||
#include "clip.h"
|
||||
#include "llava.h"
|
||||
|
@ -18,7 +19,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_toke
|
|||
n_eval = n_batch;
|
||||
}
|
||||
if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval, *n_past, 0))) {
|
||||
fprintf(stderr, "%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
LOG_TEE("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
|
@ -45,7 +46,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,
|
|||
const llama_token id = llama_sampling_sample(ctx_sampling, ctx_llama, NULL);
|
||||
llama_sampling_accept(ctx_sampling, ctx_llama, id, true);
|
||||
static std::string ret;
|
||||
if (id == llama_token_eos(llama_get_model(ctx_llama))) {
|
||||
if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
|
||||
ret = "</s>";
|
||||
} else {
|
||||
ret = llama_token_to_piece(ctx_llama, id);
|
||||
|
@ -73,7 +74,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
|
|||
size_t img_base64_str_start, img_base64_str_end;
|
||||
find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end);
|
||||
if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) {
|
||||
fprintf(stderr, "%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
||||
LOG_TEE("%s: invalid base64 image tag. must be %s<base64 byte string>%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -87,7 +88,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip
|
|||
|
||||
auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size());
|
||||
if (!embed) {
|
||||
fprintf(stderr, "%s: could not load image from base64 string.\n", __func__);
|
||||
LOG_TEE("%s: could not load image from base64 string.\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -112,8 +113,8 @@ struct llava_context {
|
|||
};
|
||||
|
||||
static void show_additional_info(int /*argc*/, char ** argv) {
|
||||
fprintf(stderr, "\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
fprintf(stderr, " note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
|
||||
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
|
||||
}
|
||||
|
||||
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
|
||||
|
@ -123,18 +124,18 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
|
|||
auto prompt = params->prompt;
|
||||
if (prompt_contains_image(prompt)) {
|
||||
if (!params->image.empty()) {
|
||||
fprintf(stderr, "using base64 encoded image instead of command line image path\n");
|
||||
LOG_TEE("using base64 encoded image instead of command line image path\n");
|
||||
}
|
||||
embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
|
||||
if (!embed) {
|
||||
fprintf(stderr, "%s: can't load image from prompt\n", __func__);
|
||||
LOG_TEE("%s: can't load image from prompt\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
params->prompt = remove_image_from_prompt(prompt);
|
||||
} else {
|
||||
embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str());
|
||||
if (!embed) {
|
||||
fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str());
|
||||
LOG_TEE("%s: is %s really an image file?\n", __func__, params->image.c_str());
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
@ -153,18 +154,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
||||
system_prompt = prompt.substr(0, image_pos);
|
||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||
printf("system_prompt: %s\n", system_prompt.c_str());
|
||||
LOG_TEE("system_prompt: %s\n", system_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
printf("user_prompt: %s\n", user_prompt.c_str());
|
||||
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
|
||||
if (params->verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -174,7 +175,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
if (params->verbose_prompt) {
|
||||
auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
|
||||
for (int i = 0; i < (int) tmp.size(); i++) {
|
||||
printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -185,7 +186,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
|
||||
// generate the response
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
LOG_TEE("\n");
|
||||
|
||||
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
|
||||
std::string response = "";
|
||||
|
@ -224,7 +225,7 @@ static struct llava_context * llava_init(gpt_params * params) {
|
|||
|
||||
llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
|
||||
LOG_TEE("%s: error: unable to load model\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -234,7 +235,7 @@ static struct llava_context * llava_init(gpt_params * params) {
|
|||
llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
|
||||
|
||||
if (ctx_llama == NULL) {
|
||||
fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
|
||||
LOG_TEE("%s: error: failed to create the llama_context\n" , __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -257,6 +258,12 @@ static void llava_free(struct llava_context * ctx_llava) {
|
|||
llama_backend_free();
|
||||
}
|
||||
|
||||
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
|
||||
(void) level;
|
||||
(void) user_data;
|
||||
LOG_TEE("%s", text);
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_time_init();
|
||||
|
||||
|
@ -266,6 +273,14 @@ int main(int argc, char ** argv) {
|
|||
show_additional_info(argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
#ifndef LOG_DISABLE_LOGS
|
||||
log_set_target(log_filename_generator("llava", "log"));
|
||||
LOG_TEE("Log start\n");
|
||||
log_dump_cmdline(argc, argv);
|
||||
llama_log_set(llama_log_callback_logTee, nullptr);
|
||||
#endif // LOG_DISABLE_LOGS
|
||||
|
||||
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
|
||||
gpt_print_usage(argc, argv, params);
|
||||
show_additional_info(argc, argv);
|
||||
|
@ -274,7 +289,7 @@ int main(int argc, char ** argv) {
|
|||
|
||||
auto ctx_llava = llava_init(¶ms);
|
||||
if (ctx_llava == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to init llava\n", __func__);
|
||||
LOG_TEE("%s: error: failed to init llava\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ static std::pair<int, int> select_best_resolution(const std::pair<int, int>& ori
|
|||
int downscaled_height = static_cast<int>(original_height * scale);
|
||||
int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
|
||||
int wasted_resolution = (width * height) - effective_resolution;
|
||||
// fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||
// LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
|
||||
if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
|
||||
max_effective_resolution = effective_resolution;
|
||||
min_wasted_resolution = wasted_resolution;
|
||||
|
@ -154,13 +154,13 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
|
|||
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
|
||||
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
|
||||
if (newline_tmp->buffer == NULL) {
|
||||
printf("newline_tmp tensor buffer is NULL\n");
|
||||
LOG_TEE("newline_tmp tensor buffer is NULL\n");
|
||||
}
|
||||
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
|
||||
} else {
|
||||
model.newline->data = newline_tmp->data;
|
||||
if (model.newline->data == NULL) {
|
||||
printf("newline_tmp tensor data is NULL\n");
|
||||
LOG_TEE("newline_tmp tensor data is NULL\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -224,7 +224,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||
img_res_v.size = 0;
|
||||
img_res_v.data = nullptr;
|
||||
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
|
||||
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
|
||||
LOG_TEE("%s: unable to preprocess image\n", __func__);
|
||||
delete[] img_res_v.data;
|
||||
return false;
|
||||
}
|
||||
|
@ -239,7 +239,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
|
||||
delete[] img_res_v.data;
|
||||
if (!encoded) {
|
||||
fprintf(stderr, "Unable to encode image\n");
|
||||
LOG_TEE("Unable to encode image\n");
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -252,12 +252,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
|
||||
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
|
||||
if (!encoded) {
|
||||
fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
const int64_t t_img_enc_batch_us = ggml_time_us();
|
||||
printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
|
||||
|
||||
const int32_t * image_grid = clip_image_grid(ctx_clip);
|
||||
|
||||
|
@ -290,12 +290,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
|
|||
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
|
||||
}
|
||||
|
||||
printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
||||
LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
|
||||
|
||||
const int64_t t_img_enc_end_us = ggml_time_us();
|
||||
float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0;
|
||||
|
||||
printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
|
||||
LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -305,7 +305,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
|
|||
int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama));
|
||||
auto n_image_embd = clip_n_mmproj_embd(ctx_clip);
|
||||
if (n_image_embd != n_llama_embd) {
|
||||
printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
|
||||
LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -314,13 +314,13 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx *
|
|||
bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) {
|
||||
float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model
|
||||
if (!image_embd) {
|
||||
fprintf(stderr, "Unable to allocate memory for image embeddings\n");
|
||||
LOG_TEE("Unable to allocate memory for image embeddings\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
int n_img_pos;
|
||||
if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) {
|
||||
fprintf(stderr, "%s: cannot encode image, aborting\n", __func__);
|
||||
LOG_TEE("%s: cannot encode image, aborting\n", __func__);
|
||||
free(image_embd);
|
||||
return false;
|
||||
}
|
||||
|
@ -340,7 +340,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_
|
|||
}
|
||||
llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, };
|
||||
if (llama_decode(ctx_llama, batch)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
LOG_TEE("%s : failed to eval\n", __func__);
|
||||
return false;
|
||||
}
|
||||
*n_past += n_eval;
|
||||
|
@ -352,7 +352,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
|||
clip_image_u8 * img = clip_image_u8_init();
|
||||
if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) {
|
||||
clip_image_u8_free(img);
|
||||
fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__);
|
||||
LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -361,7 +361,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
|||
bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos);
|
||||
if (!image_embed_result) {
|
||||
clip_image_u8_free(img);
|
||||
fprintf(stderr, "%s: coulnd't embed the image\n", __func__);
|
||||
LOG_TEE("%s: coulnd't embed the image\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -375,7 +375,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c
|
|||
static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) {
|
||||
auto file = fopen(path, "rb");
|
||||
if (file == NULL) {
|
||||
fprintf(stderr, "%s: can't read file %s\n", __func__, path);
|
||||
LOG_TEE("%s: can't read file %s\n", __func__, path);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -385,7 +385,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
|
|||
|
||||
auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data
|
||||
if (buffer == NULL) {
|
||||
fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
|
||||
LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path);
|
||||
perror("Memory allocation error");
|
||||
fclose(file);
|
||||
return false;
|
||||
|
@ -410,7 +410,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx
|
|||
long image_bytes_length;
|
||||
auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length);
|
||||
if (!loaded) {
|
||||
fprintf(stderr, "%s: failed to load %s\n", __func__, image_path);
|
||||
LOG_TEE("%s: failed to load %s\n", __func__, image_path);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
|
|
@ -299,7 +299,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (id == llama_token_eos(model)) {
|
||||
if (llama_token_is_eog(model, id)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
|
|
|
@ -141,7 +141,7 @@ int main(int argc, char ** argv){
|
|||
printf("%s", token_str.c_str());
|
||||
}
|
||||
|
||||
if (id == llama_token_eos(model)) {
|
||||
if (llama_token_is_eog(model, id)) {
|
||||
has_eos = true;
|
||||
}
|
||||
|
||||
|
|
|
@ -795,8 +795,8 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
// deal with end of text token in interactive mode
|
||||
if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
|
||||
// deal with end of generation tokens in interactive mode
|
||||
if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
|
||||
LOG("found EOS token\n");
|
||||
|
||||
if (params.interactive) {
|
||||
|
@ -920,8 +920,8 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
|
||||
// end of generation
|
||||
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
|
||||
LOG_TEE(" [end of text]\n");
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -359,7 +359,7 @@ int main(int argc, char ** argv) {
|
|||
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
|
||||
|
||||
if (client.n_decoded > 2 &&
|
||||
(id == llama_token_eos(model) ||
|
||||
(llama_token_is_eog(model, id) ||
|
||||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
|
||||
client.response.find("User:") != std::string::npos ||
|
||||
client.response.find('\n') != std::string::npos)) {
|
||||
|
|
|
@ -252,8 +252,8 @@ int main(int argc, char ** argv) {
|
|||
// sample the most likely token
|
||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream?
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
LOG_TEE("\n");
|
||||
|
||||
break;
|
||||
|
|
File diff suppressed because it is too large
Load diff
File diff suppressed because one or more lines are too long
|
@ -1201,7 +1201,7 @@ struct server_context {
|
|||
});
|
||||
}
|
||||
|
||||
if (result.tok == llama_token_eos(model)) {
|
||||
if (llama_token_is_eog(model, result.tok)) {
|
||||
slot.stopped_eos = true;
|
||||
slot.has_next_token = false;
|
||||
|
||||
|
|
|
@ -29,7 +29,7 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.
|
|||
cd ../../..
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ../
|
||||
cmake -DLLAMA_CURL=ON ../
|
||||
cmake --build . --target server
|
||||
```
|
||||
|
||||
|
|
|
@ -381,11 +381,6 @@ static json oaicompat_completion_params_parse(
|
|||
} else {
|
||||
llama_params["stop"] = json_value(body, "stop", json::array());
|
||||
}
|
||||
// Some chat templates don't use EOS token to stop generation
|
||||
// We must add their end sequences to list of stop words
|
||||
llama_params["stop"].push_back("<|im_end|>"); // chatml
|
||||
llama_params["stop"].push_back("<end_of_turn>"); // gemma
|
||||
llama_params["stop"].push_back("<|eot_id|>"); // llama-3
|
||||
|
||||
// Handle "response_format" field
|
||||
if (body.contains("response_format")) {
|
||||
|
|
|
@ -133,8 +133,8 @@ int main(int argc, char ** argv) {
|
|||
// sample the most likely token
|
||||
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
|
||||
|
||||
// is it an end of stream?
|
||||
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
|
||||
// is it an end of generation?
|
||||
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
|
||||
LOG_TEE("\n");
|
||||
|
||||
break;
|
||||
|
|
|
@ -360,7 +360,7 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
if (token_id == llama_token_eos(model_tgt)) {
|
||||
if (llama_token_is_eog(model_tgt, token_id)) {
|
||||
has_eos = true;
|
||||
}
|
||||
++n_predict;
|
||||
|
|
|
@ -73,6 +73,7 @@ struct my_llama_model {
|
|||
static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model";
|
||||
static const char * LLM_KV_TRAINING_TYPE = "training.type";
|
||||
|
||||
static const char * LLM_KV_GENERAL_NAME = "general.name";
|
||||
static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture";
|
||||
static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type";
|
||||
|
||||
|
@ -529,6 +530,7 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex
|
|||
|
||||
static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) {
|
||||
const char * arch = "llama";
|
||||
|
||||
enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
||||
|
||||
std::vector<char> keybuf;
|
||||
|
@ -540,6 +542,7 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo
|
|||
|
||||
// set arch
|
||||
gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch);
|
||||
gguf_set_val_str(fctx, LLM_KV_GENERAL_NAME, arch);
|
||||
gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype);
|
||||
|
||||
// set hparams
|
||||
|
|
|
@ -135,6 +135,7 @@ class MODEL_ARCH(IntEnum):
|
|||
XVERSE = auto()
|
||||
COMMAND_R = auto()
|
||||
DBRX = auto()
|
||||
OLMO = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
|
@ -210,6 +211,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
MODEL_ARCH.XVERSE: "xverse",
|
||||
MODEL_ARCH.COMMAND_R: "command-r",
|
||||
MODEL_ARCH.DBRX: "dbrx",
|
||||
MODEL_ARCH.OLMO: "olmo",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
|
@ -695,6 +697,17 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_DOWN_EXP,
|
||||
MODEL_TENSOR.FFN_UP_EXP,
|
||||
],
|
||||
MODEL_ARCH.OLMO: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
|
253
llama.cpp
253
llama.cpp
|
@ -222,6 +222,7 @@ enum llm_arch {
|
|||
LLM_ARCH_XVERSE,
|
||||
LLM_ARCH_COMMAND_R,
|
||||
LLM_ARCH_DBRX,
|
||||
LLM_ARCH_OLMO,
|
||||
LLM_ARCH_UNKNOWN,
|
||||
};
|
||||
|
||||
|
@ -256,6 +257,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
|||
{ LLM_ARCH_XVERSE, "xverse" },
|
||||
{ LLM_ARCH_COMMAND_R, "command-r" },
|
||||
{ LLM_ARCH_DBRX, "dbrx" },
|
||||
{ LLM_ARCH_OLMO, "olmo" },
|
||||
{ LLM_ARCH_UNKNOWN, "(unknown)" },
|
||||
};
|
||||
|
||||
|
@ -990,6 +992,20 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
|
|||
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_OLMO,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_UNKNOWN,
|
||||
{
|
||||
|
@ -2104,7 +2120,7 @@ struct llama_vocab {
|
|||
id special_prefix_id = -1;
|
||||
id special_suffix_id = -1;
|
||||
id special_middle_id = -1;
|
||||
id special_eot_id = -1;
|
||||
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
|
||||
|
||||
bool add_space_prefix = true;
|
||||
|
||||
|
@ -3754,7 +3770,7 @@ static void llm_load_hparams(
|
|||
switch (hparams.n_layer) {
|
||||
case 22: model.type = e_model::MODEL_1B; break;
|
||||
case 26: model.type = e_model::MODEL_3B; break;
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 32: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_7B : e_model::MODEL_8B; break; // LLaMa 8B v3 uses GQA
|
||||
case 40: model.type = e_model::MODEL_13B; break;
|
||||
case 48: model.type = e_model::MODEL_34B; break;
|
||||
case 60: model.type = e_model::MODEL_30B; break;
|
||||
|
@ -4070,6 +4086,18 @@ static void llm_load_hparams(
|
|||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OLMO:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 22: model.type = e_model::MODEL_1B; break;
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 80: model.type = e_model::MODEL_70B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
default: (void)0;
|
||||
}
|
||||
|
||||
|
@ -4151,7 +4179,10 @@ static void llm_load_vocab(
|
|||
vocab.special_prefix_id = 67;
|
||||
vocab.special_suffix_id = 69;
|
||||
vocab.special_middle_id = 68;
|
||||
vocab.special_eot_id = 70;
|
||||
// TODO: this is not EOT, it is "file separator" token, needs fix
|
||||
// https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572
|
||||
//vocab.special_eot_id = 70;
|
||||
vocab.special_eot_id = 107;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4280,6 +4311,7 @@ static void llm_load_vocab(
|
|||
{ LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id },
|
||||
{ LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id },
|
||||
};
|
||||
|
||||
for (const auto & it : special_token_types) {
|
||||
const std::string & key = kv(std::get<0>(it));
|
||||
int32_t & id = std::get<1>(it);
|
||||
|
@ -4294,7 +4326,6 @@ static void llm_load_vocab(
|
|||
} else {
|
||||
id = new_id;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Handle add_bos_token and add_eos_token
|
||||
|
@ -4308,6 +4339,27 @@ static void llm_load_vocab(
|
|||
vocab.special_add_eos = int(temp);
|
||||
}
|
||||
}
|
||||
|
||||
// find EOT token: "<|eot_id|>", "<|im_emd|>", "<end_of_turn>", etc.
|
||||
//
|
||||
// TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID
|
||||
// for now, we apply this workaround to find the EOT token based on its text
|
||||
if (vocab.special_eot_id == -1) {
|
||||
for (const auto & t : vocab.token_to_id) {
|
||||
if (
|
||||
// TODO: gemma "<end_of_turn>" is exported as a normal token, so the following check does not work
|
||||
// need to fix convert script
|
||||
//vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL &&
|
||||
(t.first == "<|eot_id|>" ||
|
||||
t.first == "<|im_emd|>" ||
|
||||
t.first == "<end_of_turn>"
|
||||
)
|
||||
) {
|
||||
vocab.special_eot_id = t.second;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// build special tokens cache
|
||||
|
@ -4477,7 +4529,12 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|||
if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
|
||||
if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); }
|
||||
if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); }
|
||||
|
||||
if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); }
|
||||
if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); }
|
||||
if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); }
|
||||
if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); }
|
||||
if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); }
|
||||
}
|
||||
|
||||
// Returns false if cancelled by progress_callback
|
||||
|
@ -5666,6 +5723,37 @@ static bool llm_load_tensors(
|
|||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
|
||||
// output
|
||||
{
|
||||
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
|
||||
// if output is NULL, init from the input tok embed
|
||||
if (model.output == NULL) {
|
||||
model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
||||
ml.n_created--; // artificial tensor
|
||||
ml.size_data += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
ggml_context * ctx_split = ctx_for_layer_split(i);
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
||||
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
||||
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
||||
|
||||
|
||||
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
||||
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
||||
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
||||
|
@ -10096,6 +10184,139 @@ struct llm_build_context {
|
|||
return gf;
|
||||
|
||||
}
|
||||
|
||||
// ref: https://allenai.org/olmo
|
||||
// based on the original build_llama() function, changes:
|
||||
// * non-parametric layer norm
|
||||
// * clamp qkv
|
||||
// * removed bias
|
||||
// * removed MoE
|
||||
struct ggml_cgraph * build_olmo() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
// norm
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
NULL, NULL,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
cb(Qcur, "Qcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
cb(Kcur, "Kcur", il);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
if (hparams.f_clamp_kqv > 0.0f) {
|
||||
Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv);
|
||||
cb(Vcur, "Vcur", il);
|
||||
}
|
||||
|
||||
Qcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
Kcur = ggml_rope_custom(
|
||||
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
||||
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
|
||||
model.layers[il].wo, nullptr,
|
||||
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
}
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
n_tokens = n_outputs;
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
NULL, NULL,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
|
||||
if (layer_dir != nullptr) {
|
||||
cur = ggml_add(ctx0, cur, layer_dir);
|
||||
}
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
NULL, NULL,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
};
|
||||
|
||||
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
||||
|
@ -10301,6 +10522,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
{
|
||||
result = llm.build_dbrx();
|
||||
} break;
|
||||
case LLM_ARCH_OLMO:
|
||||
{
|
||||
result = llm.build_olmo();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
@ -13072,16 +13297,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|||
GGML_ASSERT(ctx);
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
bool allow_eos = false;
|
||||
bool allow_eog = false;
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (stack.empty()) {
|
||||
allow_eos = true;
|
||||
allow_eog = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const llama_token eos = llama_token_eos(&ctx->model);
|
||||
|
||||
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
|
||||
candidates_decoded.reserve(candidates->size);
|
||||
std::vector<llama_grammar_candidate> candidates_grammar;
|
||||
|
@ -13090,8 +13313,8 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
|||
for (size_t i = 0; i < candidates->size; ++i) {
|
||||
const llama_token id = candidates->data[i].id;
|
||||
const std::string piece = llama_token_to_piece(ctx, id);
|
||||
if (id == eos) {
|
||||
if (!allow_eos) {
|
||||
if (llama_token_is_eog(&ctx->model, id)) {
|
||||
if (!allow_eog) {
|
||||
candidates->data[i].logit = -INFINITY;
|
||||
}
|
||||
} else if (piece.empty() || piece[0] == 0) {
|
||||
|
@ -13280,7 +13503,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
|
|||
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
|
||||
if (token == llama_token_eos(&ctx->model)) {
|
||||
if (llama_token_is_eog(&ctx->model, token)) {
|
||||
for (const auto & stack : grammar->stacks) {
|
||||
if (stack.empty()) {
|
||||
return;
|
||||
|
@ -15154,6 +15377,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
|||
case LLM_ARCH_MINICPM:
|
||||
case LLM_ARCH_XVERSE:
|
||||
case LLM_ARCH_COMMAND_R:
|
||||
case LLM_ARCH_OLMO:
|
||||
return LLAMA_ROPE_TYPE_NORM;
|
||||
|
||||
// the pairs of head values are offset by n_rot/2
|
||||
|
@ -16683,6 +16907,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to
|
|||
return model->vocab.id_to_token[token].type;
|
||||
}
|
||||
|
||||
bool llama_token_is_eog(const struct llama_model * model, llama_token token) {
|
||||
return token != -1 && (
|
||||
token == llama_token_eos(model) ||
|
||||
token == llama_token_eot(model)
|
||||
);
|
||||
}
|
||||
|
||||
llama_token llama_token_bos(const struct llama_model * model) {
|
||||
return model->vocab.special_bos_id;
|
||||
}
|
||||
|
|
5
llama.h
5
llama.h
|
@ -783,6 +783,9 @@ extern "C" {
|
|||
|
||||
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
|
||||
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
|
||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||
|
@ -796,7 +799,7 @@ extern "C" {
|
|||
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
|
||||
|
||||
// codellama infill tokens
|
||||
// Codellama infill tokens
|
||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
||||
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
|
||||
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue