Merge branch 'master' into concedo_experimental
# Conflicts:
#	Makefile
#	tests/test-grad0.cpp
#	tests/test-quantize-perf.cpp
This commit is contained in:
commit
c2c238b4f3
20 changed files with 44 additions and 38 deletions
|
@ -355,6 +355,11 @@ else()
|
||||||
message(STATUS "Unknown architecture")
|
message(STATUS "Unknown architecture")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (MINGW)
|
||||||
|
# Target Windows 8 for PrefetchVirtualMemory
|
||||||
|
add_compile_definitions(_WIN32_WINNT=0x602)
|
||||||
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
# Build libraries
|
# Build libraries
|
||||||
#
|
#
|
||||||
|
|
|
@ -61,13 +61,13 @@
|
||||||
// #define LOG_TARGET stderr
|
// #define LOG_TARGET stderr
|
||||||
// #include "log.h"
|
// #include "log.h"
|
||||||
//
|
//
|
||||||
// The log target can also be redirected to a diffrent function
|
// The log target can also be redirected to a different function
|
||||||
// like so:
|
// like so:
|
||||||
//
|
//
|
||||||
// #define LOG_TARGET log_handler_diffrent()
|
// #define LOG_TARGET log_handler_different()
|
||||||
// #include "log.h"
|
// #include "log.h"
|
||||||
//
|
//
|
||||||
// FILE* log_handler_diffrent()
|
// FILE* log_handler_different()
|
||||||
// {
|
// {
|
||||||
// return stderr;
|
// return stderr;
|
||||||
// }
|
// }
|
||||||
|
@ -421,7 +421,7 @@ inline FILE *log_handler2_impl(bool change = false, LogTriState append = LogTriS
|
||||||
|
|
||||||
// Disables logs entirely at runtime.
|
// Disables logs entirely at runtime.
|
||||||
// Makes LOG() and LOG_TEE() produce no output,
|
// Makes LOG() and LOG_TEE() produce no output,
|
||||||
// untill enabled back.
|
// until enabled back.
|
||||||
#define log_disable() log_disable_impl()
|
#define log_disable() log_disable_impl()
|
||||||
|
|
||||||
// INTERNAL, DO NOT USE
|
// INTERNAL, DO NOT USE
|
||||||
|
|
|
@ -585,7 +585,7 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
|
||||||
|
|
||||||
if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
|
if any("model.embed_tokens.weight" in mp.model for mp in models_plus):
|
||||||
# Transformers models put different tensors in different files, but
|
# Transformers models put different tensors in different files, but
|
||||||
# don't split indivdual tensors between files.
|
# don't split individual tensors between files.
|
||||||
model: LazyModel = {}
|
model: LazyModel = {}
|
||||||
for mp in models_plus:
|
for mp in models_plus:
|
||||||
model.update(mp.model)
|
model.update(mp.model)
|
||||||
|
@ -678,7 +678,7 @@ class LazyUnpickler(pickle.Unpickler):
|
||||||
return func(*args)
|
return func(*args)
|
||||||
|
|
||||||
CLASSES: dict[tuple[str, str], Any] = {
|
CLASSES: dict[tuple[str, str], Any] = {
|
||||||
# getattr used here as a workaround for mypy not being smart enough to detrmine
|
# getattr used here as a workaround for mypy not being smart enough to determine
|
||||||
# the staticmethods have a __func__ attribute.
|
# the staticmethods have a __func__ attribute.
|
||||||
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
|
||||||
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
|
('torch._utils', '_rebuild_tensor_v2'): getattr(lazy_rebuild_tensor_v2, '__func__'),
|
||||||
|
|
|
@ -739,7 +739,7 @@ bool clip_image_preprocess(const clip_ctx * ctx, const clip_image_u8 * img, clip
|
||||||
temp->ny = longer_side;
|
temp->ny = longer_side;
|
||||||
temp->size = 3 * longer_side * longer_side;
|
temp->size = 3 * longer_side * longer_side;
|
||||||
temp->data = new uint8_t[temp->size]();
|
temp->data = new uint8_t[temp->size]();
|
||||||
uint8_t bc[3] = {122, 116, 104}; // bakground color in RGB from LLaVA
|
uint8_t bc[3] = {122, 116, 104}; // background color in RGB from LLaVA
|
||||||
|
|
||||||
// fill with background color
|
// fill with background color
|
||||||
for (size_t i = 0; i < temp->size; i++) {
|
for (size_t i = 0; i < temp->size; i++) {
|
||||||
|
|
|
@ -51,7 +51,7 @@ def bytes_to_unicode():
|
||||||
The reversible bpe codes work on unicode strings.
|
The reversible bpe codes work on unicode strings.
|
||||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||||
This is a signficant percentage of your normal, say, 32K bpe vocab.
|
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# llama.cpp/examples/lookahead
|
# llama.cpp/examples/lookahead
|
||||||
|
|
||||||
Demonstartion of lookahead decoding technique:
|
Demonstration of lookahead decoding technique:
|
||||||
|
|
||||||
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
https://lmsys.org/blog/2023-11-21-lookahead-decoding/
|
||||||
|
|
||||||
|
|
|
@ -222,7 +222,7 @@ node index.js
|
||||||
|
|
||||||
`content`: Set the text to process.
|
`content`: Set the text to process.
|
||||||
|
|
||||||
**POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
|
- **POST** `/infill`: For code infilling. Takes a prefix and a suffix and returns the predicted completion as stream.
|
||||||
|
|
||||||
*Options:*
|
*Options:*
|
||||||
|
|
||||||
|
|
|
@ -11227,7 +11227,7 @@ class binary_reader
|
||||||
}
|
}
|
||||||
if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array
|
if (is_ndarray) // ndarray dimensional vector can only contain integers, and can not embed another array
|
||||||
{
|
{
|
||||||
return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimentional vector is not allowed", "size"), nullptr));
|
return sax->parse_error(chars_read, get_token_string(), parse_error::create(113, chars_read, exception_message(input_format, "ndarray dimensional vector is not allowed", "size"), nullptr));
|
||||||
}
|
}
|
||||||
std::vector<size_t> dim;
|
std::vector<size_t> dim;
|
||||||
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))
|
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))
|
||||||
|
|
|
@ -114,7 +114,7 @@ export async function* llama(prompt, params = {}, config = {}) {
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Call llama, return an event target that you can subcribe to
|
// Call llama, return an event target that you can subscribe to
|
||||||
//
|
//
|
||||||
// Example:
|
// Example:
|
||||||
//
|
//
|
||||||
|
|
|
@ -223,7 +223,7 @@
|
||||||
repeat_last_n: 256, // 0 = disable penalty, -1 = context size
|
repeat_last_n: 256, // 0 = disable penalty, -1 = context size
|
||||||
repeat_penalty: 1.18, // 1.0 = disabled
|
repeat_penalty: 1.18, // 1.0 = disabled
|
||||||
top_k: 40, // <= 0 to use vocab size
|
top_k: 40, // <= 0 to use vocab size
|
||||||
top_p: 0.5, // 1.0 = disabled
|
top_p: 0.95, // 1.0 = disabled
|
||||||
min_p: 0.05, // 0 = disabled
|
min_p: 0.05, // 0 = disabled
|
||||||
tfs_z: 1.0, // 1.0 = disabled
|
tfs_z: 1.0, // 1.0 = disabled
|
||||||
typical_p: 1.0, // 1.0 = disabled
|
typical_p: 1.0, // 1.0 = disabled
|
||||||
|
@ -238,7 +238,7 @@
|
||||||
cache_prompt: true
|
cache_prompt: true
|
||||||
})
|
})
|
||||||
|
|
||||||
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */
|
/* START: Support for storing prompt templates and parameters in browsers LocalStorage */
|
||||||
|
|
||||||
const local_storage_storageKey = "llamacpp_server_local_storage";
|
const local_storage_storageKey = "llamacpp_server_local_storage";
|
||||||
|
|
||||||
|
@ -282,7 +282,7 @@
|
||||||
let importedTemplates = local_storage_getDataAsObject('user_templates')
|
let importedTemplates = local_storage_getDataAsObject('user_templates')
|
||||||
|
|
||||||
if (importedTemplates) {
|
if (importedTemplates) {
|
||||||
// saved templates were successfuly imported.
|
// saved templates were successfully imported.
|
||||||
|
|
||||||
console.log('Processing saved templates and updating default template')
|
console.log('Processing saved templates and updating default template')
|
||||||
params.value = { ...params.value, image_data: [] };
|
params.value = { ...params.value, image_data: [] };
|
||||||
|
@ -303,7 +303,7 @@
|
||||||
}
|
}
|
||||||
|
|
||||||
function userTemplateResetToDefault() {
|
function userTemplateResetToDefault() {
|
||||||
console.log('Reseting themplate to default')
|
console.log('Resetting template to default')
|
||||||
selectedUserTemplate.value.name = 'default';
|
selectedUserTemplate.value.name = 'default';
|
||||||
selectedUserTemplate.value.data = savedUserTemplates.value['default'];
|
selectedUserTemplate.value.data = savedUserTemplates.value['default'];
|
||||||
}
|
}
|
||||||
|
@ -762,7 +762,7 @@
|
||||||
|
|
||||||
<fieldset class="two">
|
<fieldset class="two">
|
||||||
${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
|
${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
|
||||||
${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
|
${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
|
||||||
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
|
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
|
||||||
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
|
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
|
||||||
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
|
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
|
||||||
|
|
|
@ -2383,6 +2383,7 @@ json oaicompat_completion_params_parse(
|
||||||
llama_params["__oaicompat"] = true;
|
llama_params["__oaicompat"] = true;
|
||||||
|
|
||||||
// Map OpenAI parameters to llama.cpp parameters
|
// Map OpenAI parameters to llama.cpp parameters
|
||||||
|
llama_params["model"] = json_value(body, "model", std::string("uknown"));
|
||||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
||||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||||
llama_params["temperature"] = json_value(body, "temperature", 0.8);
|
llama_params["temperature"] = json_value(body, "temperature", 0.8);
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# llama.cpp/examples/speculative
|
# llama.cpp/examples/speculative
|
||||||
|
|
||||||
Demonstartion of speculative decoding and tree-based speculative decoding techniques
|
Demonstration of speculative decoding and tree-based speculative decoding techniques
|
||||||
|
|
||||||
More info:
|
More info:
|
||||||
|
|
||||||
|
|
|
@ -430,7 +430,7 @@ int main(int argc, char ** argv) {
|
||||||
++n_past_tgt;
|
++n_past_tgt;
|
||||||
}
|
}
|
||||||
|
|
||||||
// the first token is always proposed by the traget model before the speculation loop so we erase it here
|
// the first token is always proposed by the target model before the speculation loop so we erase it here
|
||||||
for (int s = 0; s < n_seq_dft; ++s) {
|
for (int s = 0; s < n_seq_dft; ++s) {
|
||||||
if (!drafts[s].active) {
|
if (!drafts[s].active) {
|
||||||
continue;
|
continue;
|
||||||
|
|
|
@ -43,7 +43,7 @@ GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph
|
||||||
// ggml-backend v2 API
|
// ggml-backend v2 API
|
||||||
//
|
//
|
||||||
|
|
||||||
// Seperate tensor and graph allocator objects
|
// Separate tensor and graph allocator objects
|
||||||
// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
|
// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
|
||||||
// The original API is kept as a wrapper around the new API
|
// The original API is kept as a wrapper around the new API
|
||||||
|
|
||||||
|
|
|
@ -3116,7 +3116,7 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
size_t vl = __riscv_vsetvl_e8m1(qk/2);
|
size_t vl = __riscv_vsetvl_e8m1(qk/2);
|
||||||
|
|
||||||
// These tempory registers are for masking and shift operations
|
// These temporary registers are for masking and shift operations
|
||||||
vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
|
vuint32m2_t vt_1 = __riscv_vid_v_u32m2(vl);
|
||||||
vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
|
vuint32m2_t vt_2 = __riscv_vsll_vv_u32m2(__riscv_vmv_v_x_u32m2(1, vl), vt_1, vl);
|
||||||
|
|
||||||
|
@ -4759,7 +4759,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
||||||
|
|
||||||
vl = 16;
|
vl = 16;
|
||||||
|
|
||||||
// retreive lane to multiply with scale
|
// retrieve lane to multiply with scale
|
||||||
vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
|
vint32m2_t aux0_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 0), (scale[0]), vl);
|
||||||
vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
|
vint32m2_t aux0_1 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a0, 1), (scale[1]), vl);
|
||||||
vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
|
vint32m2_t aux1_0 = __riscv_vwmul_vx_i32m2(__riscv_vget_v_i16m2_i16m1(a1, 0), (scale[2]), vl);
|
||||||
|
|
12
ggml.c
12
ggml.c
|
@ -1,4 +1,4 @@
|
||||||
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
|
#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
|
||||||
#define _USE_MATH_DEFINES // For M_PI on MSVC
|
#define _USE_MATH_DEFINES // For M_PI on MSVC
|
||||||
|
|
||||||
#include "ggml-impl.h"
|
#include "ggml-impl.h"
|
||||||
|
@ -33,7 +33,7 @@
|
||||||
// we should just be careful :)
|
// we should just be careful :)
|
||||||
#pragma warning(disable: 4244 4267)
|
#pragma warning(disable: 4244 4267)
|
||||||
|
|
||||||
// disable POSIX deprecation warnigns
|
// disable POSIX deprecation warnings
|
||||||
// these functions are never going away, anyway
|
// these functions are never going away, anyway
|
||||||
#pragma warning(disable: 4996)
|
#pragma warning(disable: 4996)
|
||||||
#endif
|
#endif
|
||||||
|
@ -1760,7 +1760,7 @@ static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size
|
||||||
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
|
||||||
|
|
||||||
// WARN:
|
// WARN:
|
||||||
// Mis-confguration can lead to problem that's hard to reason about:
|
// Mis-configuration can lead to problem that's hard to reason about:
|
||||||
// * At best it crash or talks nosense.
|
// * At best it crash or talks nosense.
|
||||||
// * At worst it talks slightly difference but hard to perceive.
|
// * At worst it talks slightly difference but hard to perceive.
|
||||||
//
|
//
|
||||||
|
@ -7520,7 +7520,7 @@ static void ggml_compute_forward_acc_f32(
|
||||||
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
|
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
|
||||||
|
|
||||||
// view src0 and dst with these strides and data offset inbytes during acc
|
// view src0 and dst with these strides and data offset inbytes during acc
|
||||||
// nb0 is implicitely element_size because src0 and dst are contiguous
|
// nb0 is implicitly element_size because src0 and dst are contiguous
|
||||||
size_t nb1 = ((int32_t *) dst->op_params)[0];
|
size_t nb1 = ((int32_t *) dst->op_params)[0];
|
||||||
size_t nb2 = ((int32_t *) dst->op_params)[1];
|
size_t nb2 = ((int32_t *) dst->op_params)[1];
|
||||||
size_t nb3 = ((int32_t *) dst->op_params)[2];
|
size_t nb3 = ((int32_t *) dst->op_params)[2];
|
||||||
|
@ -10161,7 +10161,7 @@ static void ggml_compute_forward_set_f32(
|
||||||
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
|
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
|
||||||
|
|
||||||
// view src0 and dst with these strides and data offset inbytes during set
|
// view src0 and dst with these strides and data offset inbytes during set
|
||||||
// nb0 is implicitely element_size because src0 and dst are contiguous
|
// nb0 is implicitly element_size because src0 and dst are contiguous
|
||||||
size_t nb1 = ((int32_t *) dst->op_params)[0];
|
size_t nb1 = ((int32_t *) dst->op_params)[0];
|
||||||
size_t nb2 = ((int32_t *) dst->op_params)[1];
|
size_t nb2 = ((int32_t *) dst->op_params)[1];
|
||||||
size_t nb3 = ((int32_t *) dst->op_params)[2];
|
size_t nb3 = ((int32_t *) dst->op_params)[2];
|
||||||
|
@ -14475,7 +14475,7 @@ void ggml_build_backward_gradient_checkpointing(
|
||||||
// insert new tensors recomputing src, reusing already made replacements,
|
// insert new tensors recomputing src, reusing already made replacements,
|
||||||
// remember replacements: remember new tensors with mapping from corresponding gf nodes
|
// remember replacements: remember new tensors with mapping from corresponding gf nodes
|
||||||
// recurse for input tensors,
|
// recurse for input tensors,
|
||||||
// unless (i.e. terminating when) input tensors are replacments (like checkpoints)
|
// unless (i.e. terminating when) input tensors are replacements (like checkpoints)
|
||||||
node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
|
node->src[k] = ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
|
||||||
}
|
}
|
||||||
// insert rewritten backward node with replacements made into resulting backward graph gb
|
// insert rewritten backward node with replacements made into resulting backward graph gb
|
||||||
|
|
2
ggml.h
2
ggml.h
|
@ -215,7 +215,7 @@
|
||||||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||||
|
|
||||||
#define GGML_MAX_DIMS 4
|
#define GGML_MAX_DIMS 4
|
||||||
#define GGML_MAX_PARAMS 1024
|
#define GGML_MAX_PARAMS 2048
|
||||||
#define GGML_MAX_CONTEXTS 64
|
#define GGML_MAX_CONTEXTS 64
|
||||||
#define GGML_MAX_SRC 6
|
#define GGML_MAX_SRC 6
|
||||||
#define GGML_MAX_NAME 64
|
#define GGML_MAX_NAME 64
|
||||||
|
|
|
@ -61,7 +61,7 @@ If you want to publish the package manually for any reason, you need to have `tw
|
||||||
pip install build twine
|
pip install build twine
|
||||||
```
|
```
|
||||||
|
|
||||||
Then, folow these steps to release a new version:
|
Then, follow these steps to release a new version:
|
||||||
|
|
||||||
1. Bump the version in `pyproject.toml`.
|
1. Bump the version in `pyproject.toml`.
|
||||||
2. Build the package:
|
2. Build the package:
|
||||||
|
|
14
llama.cpp
14
llama.cpp
|
@ -2788,7 +2788,7 @@ static void llm_load_vocab(
|
||||||
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
|
// The assumption is, since special tokens aren't meant to be exposed to end user, they are designed
|
||||||
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
|
// to be unmatchable by the tokenizer, therefore tokens from the vocab, which are unmatchable by the tokenizer
|
||||||
// are special tokens.
|
// are special tokens.
|
||||||
// From testing, this appears to corelate 1:1 with special tokens.
|
// From testing, this appears to correlate 1:1 with special tokens.
|
||||||
//
|
//
|
||||||
|
|
||||||
// Counting special tokens and verifying in only one direction
|
// Counting special tokens and verifying in only one direction
|
||||||
|
@ -5876,7 +5876,7 @@ static int llama_decode_internal(
|
||||||
const int64_t n_embd = hparams.n_embd;
|
const int64_t n_embd = hparams.n_embd;
|
||||||
const int64_t n_vocab = hparams.n_vocab;
|
const int64_t n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
// helpers for smoother batch API transistion
|
// helpers for smoother batch API transition
|
||||||
// after deprecating the llama_eval calls, these will be removed
|
// after deprecating the llama_eval calls, these will be removed
|
||||||
std::vector<llama_pos> pos;
|
std::vector<llama_pos> pos;
|
||||||
|
|
||||||
|
@ -6876,12 +6876,12 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
||||||
|
|
||||||
// loop over the text
|
// loop over the text
|
||||||
while (true) {
|
while (true) {
|
||||||
// find the first occurence of a given special token in this fragment
|
// find the first occurrence of a given special token in this fragment
|
||||||
// passing offset argument only limit the "search area" but match coordinates
|
// passing offset argument only limit the "search area" but match coordinates
|
||||||
// are still relative to the source full raw_text
|
// are still relative to the source full raw_text
|
||||||
auto match = raw_text->find(special_token, raw_text_base_offset);
|
auto match = raw_text->find(special_token, raw_text_base_offset);
|
||||||
|
|
||||||
// no occurences found, stop processing this fragment for a given special token
|
// no occurrences found, stop processing this fragment for a given special token
|
||||||
if (match == std::string::npos) break;
|
if (match == std::string::npos) break;
|
||||||
|
|
||||||
// check if match is within bounds of offset <-> length
|
// check if match is within bounds of offset <-> length
|
||||||
|
@ -7766,7 +7766,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
|
||||||
|
|
||||||
for (size_t i = 0; i < candidates->size; ++i) {
|
for (size_t i = 0; i < candidates->size; ++i) {
|
||||||
const llama_token id = candidates->data[i].id;
|
const llama_token id = candidates->data[i].id;
|
||||||
const std::string & piece = ctx->model.vocab.id_to_token[id].text;
|
const std::string piece = llama_token_to_piece(ctx, id);
|
||||||
if (id == eos) {
|
if (id == eos) {
|
||||||
if (!allow_eos) {
|
if (!allow_eos) {
|
||||||
candidates->data[i].logit = -INFINITY;
|
candidates->data[i].logit = -INFINITY;
|
||||||
|
@ -7978,7 +7978,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string & piece = ctx->model.vocab.id_to_token[token].text;
|
const std::string piece = llama_token_to_piece(ctx, token);
|
||||||
|
|
||||||
// Note terminating 0 in decoded string
|
// Note terminating 0 in decoded string
|
||||||
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
const auto decoded = decode_utf8(piece, grammar->partial_utf8);
|
||||||
|
@ -8092,7 +8092,7 @@ struct llama_beam_search_data {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
||||||
// The repetative patterns below reflect the 2 stages of heaps:
|
// The repetitive patterns below reflect the 2 stages of heaps:
|
||||||
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
||||||
// * If the heap is full and a new element is found that should be included, pop the
|
// * If the heap is full and a new element is found that should be included, pop the
|
||||||
// least element to the back(), replace it with the new, then push it into the heap.
|
// least element to the back(), replace it with the new, then push it into the heap.
|
||||||
|
|
2
llama.h
2
llama.h
|
@ -216,7 +216,7 @@ extern "C" {
|
||||||
|
|
||||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||||
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
|
||||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
|
||||||
bool embedding; // embedding mode only
|
bool embedding; // embedding mode only
|
||||||
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
|
||||||
};
|
};
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue