Merge branch 'ggerganov:master' into k-shift2
commit ae8b7eb43e
5 changed files with 63 additions and 103 deletions
README.md
@@ -17,7 +17,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
-- **Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669**
+- **Introducing GGUF-my-LoRA** https://github.com/ggerganov/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
 - Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
 
 ----
examples/server/server.cpp
@@ -725,12 +725,12 @@ struct server_context {
         return nullptr;
     }
 
-    server_slot * get_available_slot(const std::string & prompt) {
+    server_slot * get_available_slot(const server_task & task) {
        server_slot * ret = nullptr;
 
        // find the slot that has at least n% prompt similarity
-       if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty()) {
-           int max_lcp_len = 0;
+       if (ret == nullptr && slot_prompt_similarity != 0.0f) {
+           int max_lcs_len = 0;
            float similarity = 0;
 
            for (server_slot & slot : slots) {
@@ -740,25 +740,25 @@ struct server_context {
                }
 
                // skip the slot if it does not contains cached tokens
-               if (slot.prompt_tokens.empty()) {
+               if (slot.cache_tokens.empty()) {
                    continue;
                }
 
-               // length of the Longest Common Prefix between the current slot's prompt and the input prompt
-               int lcp_len = longest_common_prefix(slot.cache_tokens, slot.prompt_tokens);
+               // length of the Longest Common Subsequence between the current slot's prompt and the input prompt
+               int lcs_len = longest_common_subsequence(slot.cache_tokens, task.prompt_tokens);
 
-               // fraction of the common substring length compared to the current slot's prompt length
-               similarity = static_cast<float>(lcp_len) / static_cast<int>(slot.prompt_tokens.size());
+               // fraction of the common subsequence length compared to the current slot's prompt length
+               similarity = static_cast<float>(lcs_len) / static_cast<int>(slot.cache_tokens.size());
 
                // select the current slot if the criteria match
-               if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity) {
-                   max_lcp_len = lcp_len;
+               if (lcs_len > max_lcs_len && similarity > slot_prompt_similarity) {
+                   max_lcs_len = lcs_len;
                    ret = &slot;
                }
            }
 
            if (ret != nullptr) {
-               SLT_DBG(*ret, "selected slot by lcp similarity, max_lcp_len = %d, similarity = %f\n", max_lcp_len, similarity);
+               SLT_DBG(*ret, "selected slot by lcs similarity, max_lcs_len = %d, similarity = %f\n", max_lcs_len, similarity);
            }
        }
 
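Note: a slot is accepted only when the matched token count clears both tests above: it must beat the best match seen so far, and the matched fraction of the slot's cached tokens must exceed the configured slot_prompt_similarity threshold (the server's --slot-prompt-similarity option). A minimal worked example of that acceptance test, with illustrative numbers:

    // worked example of the acceptance test above; all numbers are illustrative
    #include <cstdio>

    int main() {
        const int   lcs_len                = 80;    // tokens shared between the slot's cache and the new prompt
        const int   cache_len              = 100;   // tokens currently cached in the slot
        const float slot_prompt_similarity = 0.5f;  // example threshold value

        // same arithmetic as the diff: matched fraction of the slot's cached tokens
        const float similarity = static_cast<float>(lcs_len) / cache_len;  // 0.8

        printf("similarity = %.2f -> %s\n", similarity,
               similarity > slot_prompt_similarity ? "candidate" : "rejected");
        return 0;
    }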
@@ -1516,18 +1516,7 @@ struct server_context {
            {
                const int id_slot = json_value(task.data, "id_slot", -1);
 
-               server_slot * slot;
-
-               if (id_slot != -1) {
-                   slot = get_slot_by_id(id_slot);
-               } else {
-                   std::string prompt;
-                   if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
-                       prompt = json_value(task.data, "prompt", std::string());
-                   }
-
-                   slot = get_available_slot(prompt);
-               }
+               server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task);
 
                if (slot == nullptr) {
                    // if no slot is available, we defer this task for processing later
examples/server/utils.hpp
@@ -439,18 +439,60 @@ static std::string gen_chatcmplid() {
 // other common utils
 //
 
-static size_t longest_common_prefix(const std::vector<llama_token> & a, const std::vector<llama_token> & b) {
+static size_t longest_common_prefix(const llama_tokens & a, const llama_tokens & b) {
     size_t i;
     for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
 
     return i;
 }
 
-static size_t longest_common_prefix(const std::string & a, const std::string & b) {
-    size_t i;
-    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {}
-
-    return i;
-}
+static size_t longest_common_subsequence(const llama_tokens & a, const llama_tokens & b) {
+    // check for empty sequences
+    if (a.empty() || b.empty()) {
+        return 0;
+    }
+
+    // get the lengths of the input sequences
+    int a_len = a.size();
+    int b_len = b.size();
+
+    // initialize the maximum length of the longest common subsequence (LCS)
+    int max_length = 0;
+
+    // use two rows instead of a 2D matrix to optimize space
+    std::vector<int> prev_row(b_len + 1, 0);
+    std::vector<int> curr_row(b_len + 1, 0);
+
+    // iterate through the elements of a
+    for (int i = 1; i <= a_len; i++) {
+        // iterate through the elements of b
+        for (int j = 1; j <= b_len; j++) {
+            // if elements at the current positions match
+            if (a[i - 1] == b[j - 1]) {
+                // if it's the first element of either sequences, set LCS length to 1
+                if (i == 1 || j == 1) {
+                    curr_row[j] = 1;
+                } else {
+                    // increment LCS length by 1 compared to the previous element
+                    curr_row[j] = prev_row[j - 1] + 1;
+                }
+
+                // update max_length if necessary
+                if (curr_row[j] > max_length) {
+                    max_length = curr_row[j];
+                }
+            } else {
+                // reset LCS length if elements don't match
+                curr_row[j] = 0;
+            }
+        }
+
+        // update the previous row for the next iteration
+        prev_row = curr_row;
+    }
+
+    // return the maximum length of the LCS
+    return max_length;
+}
 
 static bool ends_with(const std::string & str, const std::string & suffix) {
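Note: llama_tokens is an alias for std::vector<llama_token> in the common headers. Also, despite the LCS naming, the DP above resets curr_row[j] to zero on every mismatch, so the quantity it measures is the longest contiguous run of tokens the two sequences share. Below is a self-contained sketch of the same two-row DP, condensed (the i == 1 || j == 1 special case is subsumed by the zero-initialized rows) and using plain int in place of llama_token:

    // self-contained sketch of the two-row DP above; ints stand in for tokens
    #include <cstdio>
    #include <vector>

    static size_t longest_common_subsequence(const std::vector<int> & a, const std::vector<int> & b) {
        if (a.empty() || b.empty()) {
            return 0;
        }

        int max_length = 0;
        std::vector<int> prev_row(b.size() + 1, 0);
        std::vector<int> curr_row(b.size() + 1, 0);

        for (size_t i = 1; i <= a.size(); i++) {
            for (size_t j = 1; j <= b.size(); j++) {
                if (a[i - 1] == b[j - 1]) {
                    curr_row[j] = prev_row[j - 1] + 1;  // extend the current common run
                    if (curr_row[j] > max_length) {
                        max_length = curr_row[j];
                    }
                } else {
                    curr_row[j] = 0;                    // a mismatch resets the run
                }
            }
            prev_row = curr_row;                        // slide the window down one row
        }

        return max_length;
    }

    int main() {
        const std::vector<int> cached = {1, 2, 3, 4, 5};
        const std::vector<int> prompt = {9, 2, 3, 4, 7};
        // the longest contiguous run shared by both sequences is {2, 3, 4}
        printf("lcs = %zu\n", longest_common_subsequence(cached, prompt));  // prints 3
        return 0;
    }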
ggml/include/ggml.h
@@ -655,14 +655,6 @@ extern "C" {
         void * abort_callback_data;
     };
 
-    // scratch buffer
-    // TODO: deprecate and remove
-    struct ggml_scratch {
-        size_t offs;
-        size_t size;
-        void * data;
-    };
-
     struct ggml_init_params {
         // memory pool
         size_t mem_size;   // bytes
@@ -766,7 +758,6 @@ extern "C" {
 
     GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
 
-    GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
     GGML_API bool   ggml_get_no_alloc(struct ggml_context * ctx);
     GGML_API void   ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
 
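Note: with struct ggml_scratch and ggml_set_scratch() removed from the public header, a context owns exactly one memory pool, and no_alloc is the only remaining allocation switch. A minimal sketch of context setup after this change (the buffer size is an arbitrary example):

    // sketch: context setup once the scratch API is gone; all tensor data
    // comes from the context's single memory pool
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,  // one flat pool, no scratch regions
            /*.mem_buffer =*/ NULL,              // let ggml allocate the pool itself
            /*.no_alloc   =*/ false,             // tensors get their data from the pool
        };

        struct ggml_context * ctx = ggml_init(params);

        // no_alloc remains the only switch for metadata-only tensor creation
        ggml_set_no_alloc(ctx, false);

        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);
        (void) t;

        ggml_free(ctx);
        return 0;
    }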
ggml/src/ggml.c
@@ -2018,15 +2018,11 @@ struct ggml_context {
     void * mem_buffer;
     bool   mem_buffer_owned;
     bool   no_alloc;
-    bool   no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
 
     int    n_objects;
 
     struct ggml_object * objects_begin;
     struct ggml_object * objects_end;
-
-    struct ggml_scratch scratch;
-    struct ggml_scratch scratch_save;
 };
 
 struct ggml_context_container {
@@ -3879,12 +3875,9 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : ggml_aligned_malloc(mem_size),
         /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
         /*.no_alloc         =*/ params.no_alloc,
-        /*.no_alloc_save    =*/ params.no_alloc,
         /*.n_objects        =*/ 0,
         /*.objects_begin    =*/ NULL,
         /*.objects_end      =*/ NULL,
-        /*.scratch          =*/ { 0, 0, NULL, },
-        /*.scratch_save     =*/ { 0, 0, NULL, },
     };
 
     GGML_ASSERT(ctx->mem_buffer != NULL);
@@ -3904,8 +3897,6 @@ void ggml_reset(struct ggml_context * ctx) {
     ctx->n_objects     = 0;
     ctx->objects_begin = NULL;
     ctx->objects_end   = NULL;
-    ctx->scratch       = (struct ggml_scratch) { 0, 0, NULL, };
-    ctx->scratch_save  = (struct ggml_scratch) { 0, 0, NULL, };
 }
 
 void ggml_free(struct ggml_context * ctx) {
@@ -3924,14 +3915,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx) {
     return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
 }
 
-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
-    const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
-
-    ctx->scratch = scratch;
-
-    return result;
-}
-
 bool ggml_get_no_alloc(struct ggml_context * ctx) {
     return ctx->no_alloc;
 }
@@ -3959,27 +3942,6 @@ size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) {
     return max_size;
 }
 
-// IMPORTANT:
-// when creating "opt" tensors, always save and load the scratch buffer
-// this is an error prone process, but it is necessary to support inplace
-// operators when using scratch buffers
-// TODO: implement a better way
-static void ggml_scratch_save(struct ggml_context * ctx) {
-    // this is needed to allow opt tensors to store their data
-    // TODO: again, need to find a better way
-    ctx->no_alloc_save = ctx->no_alloc;
-    ctx->no_alloc      = false;
-
-    ctx->scratch_save = ctx->scratch;
-    ctx->scratch.data = NULL;
-}
-
-static void ggml_scratch_load(struct ggml_context * ctx) {
-    ctx->no_alloc = ctx->no_alloc_save;
-
-    ctx->scratch = ctx->scratch_save;
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
@@ -4060,29 +4022,13 @@ static struct ggml_tensor * ggml_new_tensor_impl(
     size_t obj_alloc_size = 0;
 
     if (view_src == NULL && !ctx->no_alloc) {
-        if (ctx->scratch.data != NULL) {
-            // allocate tensor data in the scratch buffer
-            if (ctx->scratch.offs + data_size > ctx->scratch.size) {
-                GGML_LOG_WARN("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
-                        __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
-                assert(false);
-                return NULL;
-            }
-
-            data = (char * const) ctx->scratch.data + ctx->scratch.offs;
-
-            ctx->scratch.offs += data_size;
-        } else {
-            // allocate tensor data in the context's memory pool
-            obj_alloc_size = data_size;
-        }
+        // allocate tensor data in the context's memory pool
+        obj_alloc_size = data_size;
     }
 
     struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TYPE_TENSOR, GGML_TENSOR_SIZE + obj_alloc_size);
     GGML_ASSERT(obj_new);
 
-    // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
-
     struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
 
 #ifdef __clang__
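Note: after this hunk, a non-view tensor (with no_alloc off) always takes its data from the context pool via ggml_new_object(), so ggml_used_mem() accounts for all tensor data; there is no separate scratch offset to track. A small sketch of that accounting (sizes are illustrative):

    // sketch: with the scratch branch gone, every non-view tensor allocation
    // lands in the context pool, so ggml_used_mem() reflects it directly
    #include <cstdio>
    #include "ggml.h"

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 1024 * 1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        const size_t before = ggml_used_mem(ctx);
        ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 256);  // tensor object + 1 KiB of data from the pool
        const size_t after  = ggml_used_mem(ctx);

        printf("pool usage grew by %zu bytes\n", after - before);

        ggml_free(ctx);
        return 0;
    }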
@@ -4178,24 +4124,16 @@ struct ggml_tensor * ggml_new_tensor_4d(
 }
 
 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {
-    ggml_scratch_save(ctx);
-
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
 
-    ggml_scratch_load(ctx);
-
     ggml_set_i32(result, value);
 
     return result;
 }
 
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {
-    ggml_scratch_save(ctx);
-
     struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 
-    ggml_scratch_load(ctx);
-
     ggml_set_f32(result, value);
 
     return result;
@@ -20263,7 +20201,6 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
     uint64_t size_eval = 0;
 
     // compute size of intermediate results
-    // TODO: does not take into account scratch buffers !!!!
     for (int i = 0; i < cgraph->n_nodes; ++i) {
         size_eval += ggml_nbytes_pad(cgraph->nodes[i]);
     }