Merge branch 'master' into compilade/refactor-kv-cache
This commit is contained in:
commit
ff794f5535
50 changed files with 3235 additions and 2041 deletions
|
@ -1,19 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
||||
--color \
|
||||
-f ./prompts/alpaca.txt \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 7
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main --color --instruct --threads 4 \
|
||||
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
|
||||
--file ./prompts/alpaca.txt \
|
||||
--batch_size 8 --ctx_size 2048 -n -1 \
|
||||
--repeat_last_n 64 --repeat_penalty 1.3 \
|
||||
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
|
|
@ -218,20 +218,64 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
|||
fname += std::to_string(ncall);
|
||||
}
|
||||
|
||||
// avoid writing imatrix entries that do not have full data
|
||||
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
|
||||
|
||||
int n_entries = 0;
|
||||
std::vector<std::string> to_store;
|
||||
|
||||
bool is_first = true; // for printing
|
||||
for (const auto & kv : m_stats) {
|
||||
const int n_all = kv.second.counts.size();
|
||||
|
||||
if (n_all == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int n_zeros = 0;
|
||||
for (const int c : kv.second.counts) {
|
||||
if (c == 0) {
|
||||
n_zeros++;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_zeros != 0 && is_first) {
|
||||
fprintf(stderr, "\n");
|
||||
is_first = false;
|
||||
}
|
||||
|
||||
if (n_zeros == n_all) {
|
||||
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (n_zeros > 0) {
|
||||
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||
continue;
|
||||
}
|
||||
|
||||
n_entries++;
|
||||
to_store.push_back(kv.first);
|
||||
}
|
||||
|
||||
if (to_store.size() < m_stats.size()) {
|
||||
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
||||
}
|
||||
|
||||
std::ofstream out(fname, std::ios::binary);
|
||||
int n_entries = m_stats.size();
|
||||
out.write((const char *) &n_entries, sizeof(n_entries));
|
||||
for (const auto & p : m_stats) {
|
||||
int len = p.first.size();
|
||||
for (const auto & name : to_store) {
|
||||
const auto & stat = m_stats.at(name);
|
||||
int len = name.size();
|
||||
out.write((const char *) &len, sizeof(len));
|
||||
out.write(p.first.c_str(), len);
|
||||
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
|
||||
int nval = p.second.values.size();
|
||||
out.write(name.c_str(), len);
|
||||
out.write((const char *) &stat.ncall, sizeof(stat.ncall));
|
||||
int nval = stat.values.size();
|
||||
out.write((const char *) &nval, sizeof(nval));
|
||||
if (nval > 0) {
|
||||
std::vector<float> tmp(nval);
|
||||
for (int i = 0; i < nval; i++) {
|
||||
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
|
||||
tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
|
||||
}
|
||||
out.write((const char*)tmp.data(), nval*sizeof(float));
|
||||
}
|
||||
|
|
|
@ -29,9 +29,8 @@ class BuiltinRule:
|
|||
self.content = content
|
||||
self.deps = deps or []
|
||||
|
||||
# whitespace is constrained to a single space char to prevent model "running away" in
|
||||
# whitespace. Also maybe improves generation quality?
|
||||
SPACE_RULE = '" "?'
|
||||
# Constraining spaces to prevent model "running away".
|
||||
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
|
||||
|
||||
PRIMITIVE_RULES = {
|
||||
'boolean' : BuiltinRule('("true" | "false") space', []),
|
||||
|
@ -43,7 +42,7 @@ PRIMITIVE_RULES = {
|
|||
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
|
||||
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
|
||||
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
|
||||
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F]{4})', []),
|
||||
'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
|
||||
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
|
||||
'null' : BuiltinRule('"null" space', []),
|
||||
}
|
||||
|
|
|
@ -1033,6 +1033,27 @@ struct markdown_printer : public printer {
|
|||
if (field == "n_gpu_layers") {
|
||||
return 3;
|
||||
}
|
||||
if (field == "n_threads") {
|
||||
return 7;
|
||||
}
|
||||
if (field == "n_batch") {
|
||||
return 7;
|
||||
}
|
||||
if (field == "n_ubatch") {
|
||||
return 8;
|
||||
}
|
||||
if (field == "type_k" || field == "type_v") {
|
||||
return 6;
|
||||
}
|
||||
if (field == "split_mode") {
|
||||
return 5;
|
||||
}
|
||||
if (field == "flash_attn") {
|
||||
return 2;
|
||||
}
|
||||
if (field == "use_mmap") {
|
||||
return 4;
|
||||
}
|
||||
if (field == "test") {
|
||||
return 13;
|
||||
}
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 8
|
|
@ -1,18 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 8
|
|
@ -6,10 +6,6 @@
|
|||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
#include "ggml-sycl.h"
|
||||
#endif
|
||||
|
||||
#include "ggml-rpc.h"
|
||||
#ifdef _WIN32
|
||||
# include <windows.h>
|
||||
|
@ -83,12 +79,6 @@ static ggml_backend_t create_backend() {
|
|||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
|
||||
}
|
||||
#elif GGML_USE_SYCL
|
||||
fprintf(stderr, "%s: using SYCL backend\n", __func__);
|
||||
backend = ggml_backend_sycl_init(0); // init device 0
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
|
||||
}
|
||||
#endif
|
||||
|
||||
// if there aren't GPU Backends fallback to CPU backend
|
||||
|
|
|
@ -416,7 +416,7 @@
|
|||
message = html`<${Probabilities} data=${data} />`
|
||||
} else {
|
||||
const text = isArrayMessage ?
|
||||
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
|
||||
data.map(msg => msg.content).join('') :
|
||||
data;
|
||||
message = isCompletionMode ?
|
||||
text :
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
// WARNING: This file was ported from json_schema_to_grammar.py, please fix bugs / add features there first.
|
||||
const SPACE_RULE = '" "?';
|
||||
const SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}';
|
||||
|
||||
function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
|
||||
if (minItems === 0 && maxItems === 1) {
|
||||
|
@ -41,7 +41,7 @@ const PRIMITIVE_RULES = {
|
|||
object : new BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
|
||||
array : new BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
|
||||
uuid : new BuiltinRule('"\\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\\"" space', []),
|
||||
char : new BuiltinRule(`[^"\\\\] | "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F]{4})`, []),
|
||||
char : new BuiltinRule(`[^"\\\\\\x7F\\x00-\\x1F] | [\\\\] (["\\\\bfnrt] | "u" [0-9a-fA-F]{4})`, []),
|
||||
string : new BuiltinRule(`"\\"" char* "\\"" space`, ['char']),
|
||||
null : new BuiltinRule('"null" space', []),
|
||||
};
|
||||
|
|
|
@ -147,7 +147,7 @@ struct server_slot {
|
|||
int32_t n_prompt_tokens = 0;
|
||||
int32_t n_prompt_tokens_processed = 0;
|
||||
|
||||
json prompt;
|
||||
json prompt; // can be either a string, array of strings or array of token ids
|
||||
|
||||
// when a task is submitted, we first tokenize the prompt and store it here
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
|
@ -958,13 +958,16 @@ struct server_context {
|
|||
if (!task.infill) {
|
||||
const auto & prompt = data.find("prompt");
|
||||
if (prompt == data.end()) {
|
||||
send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
|
||||
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
} else {
|
||||
slot.prompt = *prompt;
|
||||
}
|
||||
if (slot.prompt.is_array() && slot.prompt.size() == 0) {
|
||||
send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);
|
||||
|
||||
if ((prompt->is_string()) ||
|
||||
(prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
|
||||
(prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer())) {
|
||||
slot.prompt = *prompt;
|
||||
} else {
|
||||
send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -1582,14 +1585,18 @@ struct server_context {
|
|||
switch (task.type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
{
|
||||
int id_slot = json_value(task.data, "id_slot", -1);
|
||||
std::string prompt = json_value(task.data, "prompt", std::string());
|
||||
const int id_slot = json_value(task.data, "id_slot", -1);
|
||||
|
||||
server_slot * slot;
|
||||
|
||||
if (id_slot != -1) {
|
||||
slot = get_slot_by_id(id_slot);
|
||||
} else {
|
||||
std::string prompt;
|
||||
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
|
||||
json_value(task.data, "prompt", std::string());
|
||||
}
|
||||
|
||||
slot = get_available_slot(prompt);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue