Merge remote-tracking branch 'origin/master' into bins
This commit is contained in:
commit
daeaeb1222
30 changed files with 1233 additions and 607 deletions
|
@ -1,19 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./llama-cli -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
||||
--color \
|
||||
-f ./prompts/alpaca.txt \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 7
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./llama-cli --color --instruct --threads 4 \
|
||||
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
|
||||
--file ./prompts/alpaca.txt \
|
||||
--batch_size 8 --ctx_size 2048 -n -1 \
|
||||
--repeat_last_n 64 --repeat_penalty 1.3 \
|
||||
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
|
|
@ -218,20 +218,64 @@ void IMatrixCollector::save_imatrix(int ncall) const {
|
|||
fname += std::to_string(ncall);
|
||||
}
|
||||
|
||||
// avoid writing imatrix entries that do not have full data
|
||||
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
|
||||
|
||||
int n_entries = 0;
|
||||
std::vector<std::string> to_store;
|
||||
|
||||
bool is_first = true; // for printing
|
||||
for (const auto & kv : m_stats) {
|
||||
const int n_all = kv.second.counts.size();
|
||||
|
||||
if (n_all == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
int n_zeros = 0;
|
||||
for (const int c : kv.second.counts) {
|
||||
if (c == 0) {
|
||||
n_zeros++;
|
||||
}
|
||||
}
|
||||
|
||||
if (n_zeros != 0 && is_first) {
|
||||
fprintf(stderr, "\n");
|
||||
is_first = false;
|
||||
}
|
||||
|
||||
if (n_zeros == n_all) {
|
||||
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
if (n_zeros > 0) {
|
||||
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
|
||||
continue;
|
||||
}
|
||||
|
||||
n_entries++;
|
||||
to_store.push_back(kv.first);
|
||||
}
|
||||
|
||||
if (to_store.size() < m_stats.size()) {
|
||||
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
|
||||
}
|
||||
|
||||
std::ofstream out(fname, std::ios::binary);
|
||||
int n_entries = m_stats.size();
|
||||
out.write((const char *) &n_entries, sizeof(n_entries));
|
||||
for (const auto & p : m_stats) {
|
||||
int len = p.first.size();
|
||||
for (const auto & name : to_store) {
|
||||
const auto & stat = m_stats.at(name);
|
||||
int len = name.size();
|
||||
out.write((const char *) &len, sizeof(len));
|
||||
out.write(p.first.c_str(), len);
|
||||
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
|
||||
int nval = p.second.values.size();
|
||||
out.write(name.c_str(), len);
|
||||
out.write((const char *) &stat.ncall, sizeof(stat.ncall));
|
||||
int nval = stat.values.size();
|
||||
out.write((const char *) &nval, sizeof(nval));
|
||||
if (nval > 0) {
|
||||
std::vector<float> tmp(nval);
|
||||
for (int i = 0; i < nval; i++) {
|
||||
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
|
||||
tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
|
||||
}
|
||||
out.write((const char*)tmp.data(), nval*sizeof(float));
|
||||
}
|
||||
|
|
|
@ -1,18 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./llama-cli -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 8
|
|
@ -1,18 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
#
|
||||
# Temporary script - will be removed in the future
|
||||
#
|
||||
|
||||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./llama-cli -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
|
||||
--color \
|
||||
--ctx_size 2048 \
|
||||
-n -1 \
|
||||
-ins -b 256 \
|
||||
--top_k 10000 \
|
||||
--temp 0.2 \
|
||||
--repeat_penalty 1.1 \
|
||||
-t 8
|
|
@ -6,10 +6,6 @@
|
|||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
#include "ggml-sycl.h"
|
||||
#endif
|
||||
|
||||
#include "ggml-rpc.h"
|
||||
#ifdef _WIN32
|
||||
# include <windows.h>
|
||||
|
@ -83,12 +79,6 @@ static ggml_backend_t create_backend() {
|
|||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
|
||||
}
|
||||
#elif GGML_USE_SYCL
|
||||
fprintf(stderr, "%s: using SYCL backend\n", __func__);
|
||||
backend = ggml_backend_sycl_init(0); // init device 0
|
||||
if (!backend) {
|
||||
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
|
||||
}
|
||||
#endif
|
||||
|
||||
// if no GPU backend is available, fall back to the CPU backend
|
||||
|
|
|
@ -416,7 +416,7 @@
|
|||
message = html`<${Probabilities} data=${data} />`
|
||||
} else {
|
||||
const text = isArrayMessage ?
|
||||
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
|
||||
data.map(msg => msg.content).join('') :
|
||||
data;
|
||||
message = isCompletionMode ?
|
||||
text :
|
||||
|
|
|
@ -147,7 +147,7 @@ struct server_slot {
|
|||
int32_t n_prompt_tokens = 0;
|
||||
int32_t n_prompt_tokens_processed = 0;
|
||||
|
||||
json prompt;
|
||||
std::string prompt;
|
||||
|
||||
// when a task is submitted, we first tokenize the prompt and store it here
|
||||
std::vector<llama_token> prompt_tokens;
|
||||
|
@ -822,13 +822,8 @@ struct server_context {
|
|||
continue;
|
||||
}
|
||||
|
||||
// skip the slot if it does not contain a prompt
|
||||
if (!slot.prompt.is_string()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// current slot's prompt
|
||||
std::string slot_prompt = slot.prompt.get<std::string>();
|
||||
std::string slot_prompt = slot.prompt;
|
||||
|
||||
// length of the current slot's prompt
|
||||
int slot_prompt_len = slot_prompt.size();
|
||||
|
@ -958,13 +953,16 @@ struct server_context {
|
|||
if (!task.infill) {
|
||||
const auto & prompt = data.find("prompt");
|
||||
if (prompt == data.end()) {
|
||||
send_error(task, "Either \"prompt\" or \"messages\" must be provided", ERROR_TYPE_INVALID_REQUEST);
|
||||
send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
} else {
|
||||
slot.prompt = *prompt;
|
||||
}
|
||||
if (slot.prompt.is_array() && slot.prompt.size() == 0) {
|
||||
send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);
|
||||
|
||||
if (prompt->is_string()) {
|
||||
slot.prompt = prompt->get<std::string>();
|
||||
} else if (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) {
|
||||
slot.prompt = prompt->at(0).get<std::string>();
|
||||
} else {
|
||||
send_error(task, "\"prompt\" must be a string or an array of strings", ERROR_TYPE_INVALID_REQUEST);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
@ -1582,14 +1580,18 @@ struct server_context {
|
|||
switch (task.type) {
|
||||
case SERVER_TASK_TYPE_COMPLETION:
|
||||
{
|
||||
int id_slot = json_value(task.data, "id_slot", -1);
|
||||
std::string prompt = json_value(task.data, "prompt", std::string());
|
||||
const int id_slot = json_value(task.data, "id_slot", -1);
|
||||
|
||||
server_slot * slot;
|
||||
|
||||
if (id_slot != -1) {
|
||||
slot = get_slot_by_id(id_slot);
|
||||
} else {
|
||||
std::string prompt;
|
||||
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
|
||||
json_value(task.data, "prompt", std::string());
|
||||
}
|
||||
|
||||
slot = get_available_slot(prompt);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue