Merge branch 'master' into concedo_experimental

# Conflicts:
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	README.md
#	build.zig
#	ggml-opencl.cpp
#	tests/CMakeLists.txt
#	tests/test-double-float.cpp
#	tests/test-sampling.cpp
This commit is contained in:
Concedo 2023-10-25 23:58:15 +08:00
commit 5db89b90b7
72 changed files with 5422 additions and 7821 deletions

View file

@ -1,8 +1,7 @@
---
name: Issue and enhancement template
about: Used to report issues and request enhancements for llama.cpp
title: "[User] Insert summary of your issue or enhancement.."
labels: ''
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug"]
assignees: ''
---
@ -46,7 +45,7 @@ $ g++ --version
# Failure Information (for bugs)
Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template.
Please help provide information about the failure / bug.
# Steps to Reproduce

28
.github/ISSUE_TEMPLATE/enhancement.md vendored Normal file
View file

@ -0,0 +1,28 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''
---
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
# Feature Description
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
# Motivation
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
# Possible Implementation
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.

View file

@ -107,7 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
std::string arg;
gpt_params default_params;
const std::string arg_prefix = "--";
llama_sampling_params & sparams = params.sampling_params;
llama_sampling_params & sparams = params.sparams;
for (int i = 1; i < argc; i++) {
arg = argv[i];
@ -241,25 +241,26 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
sparams.repeat_last_n = std::stoi(argv[i]);
sparams.penalty_last_n = std::stoi(argv[i]);
sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n);
} else if (arg == "--repeat-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.repeat_penalty = std::stof(argv[i]);
sparams.penalty_repeat = std::stof(argv[i]);
} else if (arg == "--frequency-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.frequency_penalty = std::stof(argv[i]);
sparams.penalty_freq = std::stof(argv[i]);
} else if (arg == "--presence-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.presence_penalty = std::stof(argv[i]);
sparams.penalty_present = std::stof(argv[i]);
} else if (arg == "--mirostat") {
if (++i >= argc) {
invalid_param = true;
@ -572,7 +573,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
invalid_param = true;
break;
}
params.grammar = argv[i];
sparams.grammar = argv[i];
} else if (arg == "--grammar-file") {
if (++i >= argc) {
invalid_param = true;
@ -587,7 +588,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(params.grammar)
std::back_inserter(sparams.grammar)
);
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
@ -631,6 +632,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);
process_escapes(params.input_suffix);
process_escapes(sparams.cfg_negative_prompt);
for (auto & antiprompt : params.antiprompt) {
process_escapes(antiprompt);
}
@ -640,7 +642,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
}
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
const llama_sampling_params & sparams = params.sampling_params;
const llama_sampling_params & sparams = params.sparams;
printf("usage: %s [options]\n", argv[0]);
printf("\n");
@ -678,10 +680,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty);
printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
printf(" --mirostat N use Mirostat sampling.\n");
printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
@ -878,13 +880,13 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
}
if (params.ignore_eos) {
params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY;
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}
{
LOG("warming up the model with an empty run\n");
std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_kv_cache_tokens_rm(lctx, -1, -1);
llama_reset_timings(lctx);
@ -939,7 +941,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
}
std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
const llama_token bos_id = llama_token_bos(ctx);
const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
std::string piece;
std::string result;
@ -1123,28 +1125,28 @@ std::string get_sortable_timestamp() {
void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
const llama_sampling_params & sparams = params.sampling_params;
const llama_sampling_params & sparams = params.sparams;
fprintf(stream, "build_commit: %s\n", BUILD_COMMIT);
fprintf(stream, "build_number: %d\n", BUILD_NUMBER);
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
#ifdef NDEBUG
fprintf(stream, "debug: false\n");
@ -1178,13 +1180,13 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty);
dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx));
const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
@ -1238,14 +1240,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false");
fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty);
fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty);
fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
fprintf(stream, "reverse_prompt:\n");
for (std::string ap : params.antiprompt) {

View file

@ -69,7 +69,7 @@ struct gpt_params {
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
// // sampling parameters
struct llama_sampling_params sampling_params;
struct llama_sampling_params sparams;
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_draft = ""; // draft model for speculative decoding
@ -79,7 +79,6 @@ struct gpt_params {
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files

View file

@ -399,7 +399,7 @@ namespace grammar_parser {
void print_grammar(FILE * file, const parse_state & state) {
try {
std::map<uint32_t, std::string> symbol_id_names;
for (auto kv : state.symbol_ids) {
for (const auto & kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first;
}
for (size_t i = 0, end = state.rules.size(); i < end; i++) {

View file

@ -97,22 +97,23 @@
#define LOG_TEE_TARGET stderr
#endif
// NOTE: currently disabled as it produces too many log files
// Utility to obtain "pid" like unique process id and use it when creating log files.
inline std::string log_get_pid()
{
static std::string pid;
if (pid.empty())
{
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
// it's not the same as "pid" but is unique enough to solve multiple instances
// trying to write to the same log.
std::stringstream ss;
ss << std::this_thread::get_id();
pid = ss.str();
}
return pid;
}
//inline std::string log_get_pid()
//{
// static std::string pid;
// if (pid.empty())
// {
// // std::this_thread::get_id() is the most portable way of obtaining a "process id"
// // it's not the same as "pid" but is unique enough to solve multiple instances
// // trying to write to the same log.
// std::stringstream ss;
// ss << std::this_thread::get_id();
// pid = ss.str();
// }
//
// return pid;
//}
// Utility function for generating log file names with unique id based on thread id.
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
@ -126,8 +127,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
std::stringstream buf;
buf << log_file_basename;
buf << ".";
buf << log_get_pid();
//buf << ".";
//buf << log_get_pid();
buf << ".";
buf << log_file_extension;

View file

@ -1,9 +1,9 @@
#include "sampling.h"
struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params) {
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {
struct llama_sampling_context * result = new llama_sampling_context();
result->params = params.sampling_params;
result->params = params;
result->grammar = nullptr;
// if there is a grammar, parse it
@ -23,7 +23,7 @@ struct llama_sampling_context * llama_sampling_init(const struct gpt_params & pa
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
}
result->prev.resize(params.n_ctx);
result->prev.resize(params.n_prev);
return result;
}
@ -66,25 +66,56 @@ void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * ds
dst->prev = src->prev;
}
llama_token llama_sampling_last(llama_sampling_context * ctx) {
return ctx->prev.back();
}
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n) {
const int size = ctx_sampling->prev.size();
n = std::min(n, size);
std::string result;
for (int i = size - n; i < size; i++) {
result += llama_token_to_piece(ctx_main, ctx_sampling->prev[i]);
}
return result;
}
std::string llama_sampling_print(const llama_sampling_params & params) {
char result[1024];
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, typical_p = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp,
params.mirostat, params.mirostat_eta, params.mirostat_tau);
return std::string(result);
}
llama_token llama_sampling_sample(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
struct llama_context * ctx_cfg,
const int idx) {
const int n_ctx = llama_n_ctx(ctx_main);
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const llama_sampling_params & params = ctx_sampling->params;
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
const float temp = params.temp;
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
const float top_p = params.top_p;
const float tfs_z = params.tfs_z;
const float typical_p = params.typical_p;
const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
const float repeat_penalty = params.repeat_penalty;
const float alpha_presence = params.presence_penalty;
const float alpha_frequency = params.frequency_penalty;
const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
const float penalty_repeat = params.penalty_repeat;
const float penalty_freq = params.penalty_freq;
const float penalty_present = params.penalty_present;
const int mirostat = params.mirostat;
const float mirostat_tau = params.mirostat_tau;
const float mirostat_eta = params.mirostat_eta;
@ -97,7 +128,7 @@ llama_token llama_sampling_sample(
float * logits = llama_get_logits_ith(ctx_main, idx);
// Apply params.logit_bias map
// apply params.logit_bias map
for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
@ -116,19 +147,15 @@ llama_token llama_sampling_sample(
// apply penalties
if (!prev.empty()) {
const float nl_logit = logits[llama_token_nl(ctx_main)];
const int last_n_repeat = std::min(std::min((int)prev.size(), repeat_last_n), n_ctx);
const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
llama_sample_repetition_penalty(ctx_main, &cur_p,
prev.data() + prev.size() - last_n_repeat,
last_n_repeat, repeat_penalty);
llama_sample_frequency_and_presence_penalties(ctx_main, &cur_p,
prev.data() + prev.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
llama_sample_repetition_penalties(ctx_main, &cur_p,
prev.data() + prev.size() - penalty_last_n,
penalty_last_n, penalty_repeat, penalty_freq, penalty_present);
if (!penalize_nl) {
for (size_t idx = 0; idx < cur_p.size; idx++) {
if (cur_p.data[idx].id == llama_token_nl(ctx_main)) {
if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
cur_p.data[idx].logit = nl_logit;
break;
}
@ -141,7 +168,7 @@ llama_token llama_sampling_sample(
}
if (temp <= 0) {
// Greedy sampling
// greedy sampling
id = llama_sample_token_greedy(ctx_main, &cur_p);
} else {
if (mirostat == 1) {
@ -152,8 +179,9 @@ llama_token llama_sampling_sample(
llama_sample_temp(ctx_main, &cur_p, temp);
id = llama_sample_token_mirostat_v2(ctx_main, &cur_p, mirostat_tau, mirostat_eta, &ctx_sampling->mirostat_mu);
} else {
// Temperature sampling
// temperature sampling
size_t min_keep = std::max(1, params.n_probs);
llama_sample_top_k (ctx_main, &cur_p, top_k, min_keep);
llama_sample_tail_free(ctx_main, &cur_p, tfs_z, min_keep);
llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep);
@ -183,11 +211,12 @@ llama_token llama_sampling_sample(
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id) {
llama_token id,
bool apply_grammar) {
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
ctx_sampling->prev.push_back(id);
if (ctx_sampling->grammar != NULL) {
if (ctx_sampling->grammar != NULL && apply_grammar) {
llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
}
}

View file

@ -10,30 +10,30 @@
// sampling parameters
typedef struct llama_sampling_params {
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.10f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = true; // consider newlines as a repeatable token
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
std::string grammar; // optional BNF-like grammar to constrain sampling
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // how strong is guidance
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
} llama_sampling_params;
// general sampler context
@ -58,7 +58,7 @@ struct llama_sampling_context {
#include "common.h"
// Create a new sampling context instance.
struct llama_sampling_context * llama_sampling_init(const struct gpt_params & params);
struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params);
void llama_sampling_free(struct llama_sampling_context * ctx);
@ -70,6 +70,15 @@ void llama_sampling_reset(llama_sampling_context * ctx);
// Copy the sampler context
void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst);
// Get the last sampled token
llama_token llama_sampling_last(llama_sampling_context * ctx);
// Get a string representation of the last sampled tokens
std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama_context * ctx_main, int n);
// Print sampling parameters into a string
std::string llama_sampling_print(const llama_sampling_params & params);
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
@ -96,4 +105,5 @@ llama_token llama_sampling_sample(
void llama_sampling_accept(
struct llama_sampling_context * ctx_sampling,
struct llama_context * ctx_main,
llama_token id);
llama_token id,
bool apply_grammar);

View file

@ -236,8 +236,8 @@ int64_t get_example_targets_batch(
int64_t used_samples = 0;
ggml_set_f32(target_probs, 0.0f);
llama_token bos = llama_token_bos(lctx);
llama_token eos = llama_token_eos(lctx);
llama_token bos = llama_token_bos(llama_get_model(lctx));
llama_token eos = llama_token_eos(llama_get_model(lctx));
// printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples);
for (int k=0; k<n_batch; ++k) {
// printf("%s: batch %d\n", __func__, k);
@ -924,7 +924,7 @@ size_t tokenize_file(
for (llama_token token=0; token < n_vocab; ++token) {
max_token_text_size = std::max(
max_token_text_size,
strlen(llama_token_get_text(lctx, token)));
strlen(llama_token_get_text(llama_get_model(lctx), token)));
}
// upper bound of context byte length.
@ -1425,7 +1425,7 @@ void train_opt_callback(void * vdata, int accum_step, float * sched, bool * canc
int impr_plot = -(int)(1 + (opt->loss_before - opt->loss_after) * 10.0f + 0.5f);
if (impr_plot > 0) impr_plot = 0;
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_before)) impr_plot = 0;
if (std::isnan(opt->loss_before) || std::isnan(opt->loss_after)) impr_plot = 0;
printf("%s: iter=%6d sample=%zu/%zu sched=%f loss=%f",
__func__, opt->iter, std::min(1+train->shuffle_next_sample, train->shuffle_sample_count), train->shuffle_sample_count,
*sched, opt->loss_after);

View file

@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
return parser.parse_args()
args = parse_args()
@ -86,6 +87,11 @@ if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
print(f"gguf: Conversion Endianess {endianess}")
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
@ -104,7 +110,7 @@ print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
print("hello print: ",hparams["architectures"][0])
if hparams["architectures"][0] != "BaichuanForCausalLM":
if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
@ -113,7 +119,7 @@ if hparams["architectures"][0] != "BaichuanForCausalLM":
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
print("gguf: get model metadata")
@ -224,7 +230,7 @@ gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)
# TENSORS

View file

@ -118,18 +118,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)
# TENSORS

View file

@ -78,7 +78,7 @@ print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
if hparams["architectures"][0] != "FalconForCausalLM":
if hparams["architectures"][0] not in ("RWForCausalLM", "FalconForCausalLM"):
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit(1)
@ -97,7 +97,17 @@ gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
block_count = hparams.get("num_hidden_layers")
if block_count is None:
block_count = hparams["n_layer"] # old name
n_head = hparams.get("num_attention_heads")
if n_head is None:
n_head = hparams["n_head"] # old name
n_head_kv = hparams.get("num_kv_heads")
if n_head_kv is None:
n_head_kv = hparams.get("n_head_kv", 1) # old name
gguf_writer.add_name("Falcon")
gguf_writer.add_context_length(2048) # not in config.json
@ -105,11 +115,8 @@ gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(hparams["num_attention_heads"])
if "num_kv_heads" in hparams:
gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
else:
gguf_writer.add_head_count_kv(1)
gguf_writer.add_head_count(n_head)
gguf_writer.add_head_count_kv(n_head_kv)
gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
gguf_writer.add_file_type(ftype)
@ -145,17 +152,13 @@ gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# params for qkv transform
n_head = hparams["num_attention_heads"]
n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1
head_dim = hparams["hidden_size"] // n_head
# tensor info

View file

@ -123,18 +123,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)
# TENSORS

View file

@ -388,7 +388,9 @@ def handle_metadata(cfg, hp):
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
cfg.vocabtype )
# FIXME: Respect cfg.vocab_dir?
svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
load_merges = cfg.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
convert.check_vocab_size(params, vocab)
return (params, vocab, svocab)

View file

@ -128,18 +128,27 @@ vocab_size = hparams["vocab_size"]
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)
added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)
# TENSORS

View file

@ -139,18 +139,27 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)
# TENSORS

View file

@ -111,18 +111,26 @@ tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
added_vocab = tokenizer.get_added_vocab()
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
for i in range(vocab_size):
tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
scores.append(0.0) # dummy
toktypes.append(gguf.TokenType.NORMAL)
if i not in reverse_vocab:
tokens.append(f"[PAD{i}]")
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
special_vocab.add_to_gguf(gguf_writer)
# TENSORS

View file

@ -369,7 +369,7 @@ class SentencePieceVocab:
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_list = [text for (text, idx) in items]
@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:
class OutputFile:
def __init__(self, fname_out: Path) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
def add_meta_arch(self, params: Params) -> None:
name = "LLaMA"
@ -875,10 +875,10 @@ class OutputFile:
self.gguf.close()
@staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)
of = OutputFile(fname_out)
of = OutputFile(fname_out, endianess=endianess)
# meta data
of.add_meta_arch(params)
@ -903,10 +903,10 @@ class OutputFile:
return dt.quantize(arr)
@staticmethod
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)
of = OutputFile(fname_out)
of = OutputFile(fname_out, endianess=endianess)
# meta data
of.add_meta_arch(params)
@ -1123,8 +1123,9 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
args = parser.parse_args(args_in)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
args = parser.parse_args(args_in)
if args.dump_single:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
@ -1138,6 +1139,9 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump:
do_dump_model(model_plus)
return
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG
params = Params.load(model_plus)
if params.n_ctx == -1:
@ -1159,10 +1163,13 @@ def main(args_in: list[str] | None = None) -> None:
vocab: Vocab
if args.vocab_only:
assert args.outfile, "need --outfile if using --vocab-only"
if not args.outfile:
raise ValueError("need --outfile if using --vocab-only")
# FIXME: Try to respect vocab_dir somehow?
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
print(f"Wrote {outfile}")
@ -1174,7 +1181,9 @@ def main(args_in: list[str] | None = None) -> None:
vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent
vocab = load_vocab(vocab_dir, args.vocabtype)
# FIXME: Try to respect vocab_dir somehow?
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent, load_merges = args.vocabtype == 'bpe')
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
model = model_plus.model
model = convert_model_names(model, params)
@ -1185,7 +1194,7 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
print(f"Wrote {outfile}")

View file

@ -12,26 +12,26 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
else()
add_subdirectory(main)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(perplexity)
add_subdirectory(embedding)
add_subdirectory(save-load-state)
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
add_subdirectory(finetune)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(simple)
add_subdirectory(batched)
add_subdirectory(batched-bench)
add_subdirectory(speculative)
add_subdirectory(parallel)
add_subdirectory(embd-input)
add_subdirectory(llava)
add_subdirectory(llama-bench)
add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(finetune)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(main)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize)
add_subdirectory(quantize-stats)
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
add_subdirectory(train-text-from-scratch)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()

View file

@ -154,6 +154,10 @@ int main(int argc, char ** argv) {
}
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");

View file

@ -11,12 +11,19 @@ int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL]\n" , argv[0]);
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
return 1 ;
}
// number of parallel batches
int n_parallel = 1;
// total length of the sequences including the prompt
int n_len = 32;
// number of layers to offload to the GPU
int n_gpu_layers = 0;
if (argc >= 2) {
params.model = argv[1];
}
@ -29,13 +36,18 @@ int main(int argc, char ** argv) {
n_parallel = std::atoi(argv[3]);
}
if (argc >= 5) {
n_len = std::atoi(argv[4]);
}
if (argc >= 6) {
n_gpu_layers = std::atoi(argv[5]);
}
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
// total length of the sequences including the prompt
const int n_len = 32;
// init LLM
llama_backend_init(params.numa);
@ -44,7 +56,7 @@ int main(int argc, char ** argv) {
llama_model_params model_params = llama_model_default_params();
// model_params.n_gpu_layers = 99; // offload all layers to the GPU
model_params.n_gpu_layers = n_gpu_layers;
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -175,7 +187,7 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream? -> mark the stream as finished
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {

View file

@ -47,7 +47,7 @@ struct beam_search_callback_data {
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx));
}
// Function matching type llama_beam_search_callback_fn_t.

View file

@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
if (file.size < 4) {
return false;
}
uint32_t magic = file.read_u32();
std::string magic = file.read_string(4);
return magic == GGUF_MAGIC;
}

View file

@ -1,4 +0,0 @@
PandaGPT
MiniGPT-4
*.pth

View file

@ -1,17 +0,0 @@
set(TARGET embdinput)
add_library(${TARGET} embd-input-lib.cpp embd-input.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
set(TARGET embd-input-test)
add_executable(${TARGET} embd-input-test.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama embdinput ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -1,63 +0,0 @@
### Examples for input embedding directly
## Requirement
build `libembdinput.so`
run the following comman in main dir (../../).
```
make
```
## [LLaVA](https://github.com/haotian-liu/LLaVA/) example (llava.py)
1. Obtian LLaVA model (following https://github.com/haotian-liu/LLaVA/ , use https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/).
2. Convert it to ggml format.
3. `llava_projection.pth` is [pytorch_model-00003-of-00003.bin](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin).
```
import torch
bin_path = "../LLaVA-13b-delta-v1-1/pytorch_model-00003-of-00003.bin"
pth_path = "./examples/embd-input/llava_projection.pth"
dic = torch.load(bin_path)
used_key = ["model.mm_projector.weight","model.mm_projector.bias"]
torch.save({k: dic[k] for k in used_key}, pth_path)
```
4. Check the path of LLaVA model and `llava_projection.pth` in `llava.py`.
## [PandaGPT](https://github.com/yxuansu/PandaGPT) example (panda_gpt.py)
1. Obtian PandaGPT lora model from https://github.com/yxuansu/PandaGPT. Rename the file to `adapter_model.bin`. Use [convert-lora-to-ggml.py](../../convert-lora-to-ggml.py) to convert it to ggml format.
The `adapter_config.json` is
```
{
"peft_type": "LORA",
"fan_in_fan_out": false,
"bias": null,
"modules_to_save": null,
"r": 32,
"lora_alpha": 32,
"lora_dropout": 0.1,
"target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"]
}
```
2. Papare the `vicuna` v0 model.
3. Obtain the [ImageBind](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) model.
4. Clone the PandaGPT source.
```
git clone https://github.com/yxuansu/PandaGPT
```
5. Install the requirement of PandaGPT.
6. Check the path of PandaGPT source, ImageBind model, lora model and vicuna model in panda_gpt.py.
## [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4/) example (minigpt4.py)
1. Obtain MiniGPT-4 model from https://github.com/Vision-CAIR/MiniGPT-4/ and put it in `embd-input`.
2. Clone the MiniGPT-4 source.
```
git clone https://github.com/Vision-CAIR/MiniGPT-4/
```
3. Install the requirement of PandaGPT.
4. Papare the `vicuna` v0 model.
5. Check the path of MiniGPT-4 source, MiniGPT-4 model and vicuna model in `minigpt4.py`.

View file

@ -1,221 +0,0 @@
#include "build-info.h"
#include "common.h"
#include "embd-input.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
static llama_context ** g_ctx;
extern "C" {
struct MyModel* create_mymodel(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
return nullptr;
}
print_build_info();
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = uint32_t(time(NULL));
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return nullptr;
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str());
}
struct MyModel * ret = new MyModel();
ret->ctx = ctx;
ret->params = params;
ret->n_past = 0;
// printf("ctx: %d\n", ret->ctx);
return ret;
}
void free_mymodel(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
llama_print_timings(ctx);
llama_free(ctx);
delete mymodel;
}
bool eval_float(void * model, float * input, int N){
MyModel * mymodel = (MyModel*)model;
llama_context * ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_emb = llama_n_embd(llama_get_model(ctx));
int n_past = mymodel->n_past;
int n_batch = N; // params.n_batch;
for (int i = 0; i < (int) N; i += n_batch) {
int n_eval = (int) N - i;
if (n_eval > n_batch) {
n_eval = n_batch;
}
llama_batch batch = { int32_t(n_eval), nullptr, (input+i*n_emb), nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_tokens(void * model, std::vector<llama_token> tokens) {
MyModel * mymodel = (MyModel* )model;
llama_context * ctx;
ctx = mymodel->ctx;
gpt_params params = mymodel->params;
int n_past = mymodel->n_past;
for (int i = 0; i < (int) tokens.size(); i += params.n_batch) {
int n_eval = (int) tokens.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_decode(ctx, llama_batch_get_one(&tokens[i], n_eval, n_past, 0))) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return false;
}
n_past += n_eval;
}
mymodel->n_past = n_past;
return true;
}
bool eval_id(struct MyModel* mymodel, int id) {
std::vector<llama_token> tokens;
tokens.push_back(id);
return eval_tokens(mymodel, tokens);
}
bool eval_string(struct MyModel * mymodel,const char* str){
llama_context * ctx = mymodel->ctx;
std::string str2 = str;
std::vector<llama_token> embd_inp = ::llama_tokenize(ctx, str2, true);
eval_tokens(mymodel, embd_inp);
return true;
}
llama_token sampling_id(struct MyModel* mymodel) {
llama_context* ctx = mymodel->ctx;
gpt_params params = mymodel->params;
llama_sampling_params & sparams = params.sampling_params;
// int n_ctx = llama_n_ctx(ctx);
// out of user input, sample next token
const float temp = sparams.temp;
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k;
const float top_p = sparams.top_p;
const float tfs_z = sparams.tfs_z;
const float typical_p = sparams.typical_p;
// const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
// const float repeat_penalty = params.repeat_penalty;
// const float alpha_presence = params.presence_penalty;
// const float alpha_frequency = params.frequency_penalty;
const int mirostat = sparams.mirostat;
const float mirostat_tau = sparams.mirostat_tau;
const float mirostat_eta = sparams.mirostat_eta;
// const bool penalize_nl = params.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
// Apply params.logit_bias map
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, repeat_penalty);
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx, &candidates_p);
} else {
if (mirostat == 1) {
static float mirostat_mu = 2.0f * mirostat_tau;
const int mirostat_m = 100;
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu);
} else if (mirostat == 2) {
static float mirostat_mu = 2.0f * mirostat_tau;
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
} else {
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
llama_sample_typical(ctx, &candidates_p, typical_p, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_temp(ctx, &candidates_p, temp);
id = llama_sample_token(ctx, &candidates_p);
}
}
}
return id;
}
const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
static std::string ret;
if (id == llama_token_eos(ctx)) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx, id);
}
eval_id(mymodel, id);
return ret.c_str();
}
}

View file

@ -1,35 +0,0 @@
#include "embd-input.h"
#include <stdlib.h>
#include <random>
#include <string.h>
int main(int argc, char** argv) {
auto mymodel = create_mymodel(argc, argv);
int N = 10;
int max_tgt_len = 500;
int n_embd = llama_n_embd(llama_get_model(mymodel->ctx));
// add random float embd to test evaluation
float * data = new float[N*n_embd];
std::default_random_engine e;
std::uniform_real_distribution<float> u(0,1);
for (int i=0;i<N*n_embd;i++) {
data[i] = u(e);
}
eval_string(mymodel, "user: what is the color of the flag of UN?");
eval_float(mymodel, data, N);
eval_string(mymodel, "assistant:");
eval_string(mymodel, mymodel->params.prompt.c_str());
const char* tmp;
for (int i=0; i<max_tgt_len; i++) {
tmp = sampling(mymodel);
if (strcmp(tmp, "</s>")==0) break;
printf("%s", tmp);
fflush(stdout);
}
printf("\n");
free_mymodel(mymodel);
return 0;
}

View file

@ -1,27 +0,0 @@
#ifndef _EMBD_INPUT_H_
#define _EMBD_INPUT_H_ 1
#include "common.h"
#include "llama.h"
extern "C" {
typedef struct MyModel {
llama_context* ctx;
gpt_params params;
int n_past = 0;
} MyModel;
struct MyModel* create_mymodel(int argc, char ** argv);
bool eval_float(void* model, float* input, int N);
bool eval_tokens(void* model, std::vector<llama_token> tokens);
bool eval_id(struct MyModel* mymodel, int id);
bool eval_string(struct MyModel* mymodel, const char* str);
const char * sampling(struct MyModel* mymodel);
llama_token sampling_id(struct MyModel* mymodel);
void free_mymodel(struct MyModel* mymodel);
}
#endif

View file

@ -1,72 +0,0 @@
#!/usr/bin/env python3
import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np
import os
libc = cdll.LoadLibrary("./libembdinput.so")
libc.sampling.restype=c_char_p
libc.create_mymodel.restype=c_void_p
libc.eval_string.argtypes=[c_void_p, c_char_p]
libc.sampling.argtypes=[c_void_p]
libc.eval_float.argtypes=[c_void_p, POINTER(c_float), c_int]
class MyModel:
def __init__(self, args):
argc = len(args)
c_str = [c_char_p(i.encode()) for i in args]
args_c = (c_char_p * argc)(*c_str)
self.model = c_void_p(libc.create_mymodel(argc, args_c))
self.max_tgt_len = 512
self.print_string_eval = True
def __del__(self):
libc.free_mymodel(self.model)
def eval_float(self, x):
libc.eval_float(self.model, x.astype(np.float32).ctypes.data_as(POINTER(c_float)), x.shape[1])
def eval_string(self, x):
libc.eval_string(self.model, x.encode()) # c_char_p(x.encode()))
if self.print_string_eval:
print(x)
def eval_token(self, x):
libc.eval_id(self.model, x)
def sampling(self):
s = libc.sampling(self.model)
return s
def stream_generate(self, end="</s>"):
ret = b""
end = end.encode()
for _ in range(self.max_tgt_len):
tmp = self.sampling()
ret += tmp
yield tmp
if ret.endswith(end):
break
def generate_with_print(self, end="</s>"):
ret = b""
for i in self.stream_generate(end=end):
ret += i
print(i.decode(errors="replace"), end="", flush=True)
print("")
return ret.decode(errors="replace")
def generate(self, end="</s>"):
text = b"".join(self.stream_generate(end=end))
return text.decode(errors="replace")
if __name__ == "__main__":
model = MyModel(["main", "--model", "../llama.cpp/models/ggml-vic13b-q4_1.bin", "-c", "2048"])
model.eval_string("""user: what is the color of the flag of UN?""")
x = np.random.random((5120,10))# , dtype=np.float32)
model.eval_float(x)
model.eval_string("""assistant:""")
for i in model.generate():
print(i.decode(errors="replace"), end="", flush=True)

View file

@ -1,71 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from transformers import CLIPVisionModel, CLIPImageProcessor
from PIL import Image
# model parameters from 'liuhaotian/LLaVA-13b-delta-v1-1'
vision_tower = "openai/clip-vit-large-patch14"
select_hidden_state_layer = -2
# (vision_config.image_size // vision_config.patch_size) ** 2
image_token_len = (224//14)**2
class Llava:
def __init__(self, args):
self.image_processor = CLIPImageProcessor.from_pretrained(vision_tower)
self.vision_tower = CLIPVisionModel.from_pretrained(vision_tower)
self.mm_projector = nn.Linear(1024, 5120)
self.model = MyModel(["main", *args])
def load_projection(self, path):
state = torch.load(path)
self.mm_projector.load_state_dict({
"weight": state["model.mm_projector.weight"],
"bias": state["model.mm_projector.bias"]})
def chat(self, question):
self.model.eval_string("user: ")
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
image_forward_out = self.vision_tower(embd_image.unsqueeze(0), output_hidden_states=True)
select_hidden_state = image_forward_out.hidden_states[select_hidden_state_layer]
image_feature = select_hidden_state[:, 1:]
embd_image = self.mm_projector(image_feature)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("user: ")
self.model.eval_token(32003-2) # im_start
self.model.eval_float(embd_image.T)
for i in range(image_token_len-embd_image.shape[0]):
self.model.eval_token(32003-3) # im_patch
self.model.eval_token(32003-1) # im_end
self.model.eval_string(question)
self.model.eval_string("\nassistant: ")
return self.model.generate_with_print()
if __name__=="__main__":
# model form liuhaotian/LLaVA-13b-delta-v1-1
a = Llava(["--model", "./models/ggml-llava-13b-v1.1.bin", "-c", "2048"])
# Extract from https://huggingface.co/liuhaotian/LLaVA-13b-delta-v1-1/blob/main/pytorch_model-00003-of-00003.bin.
# Also here can use pytorch_model-00003-of-00003.bin directly.
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"llava_projection.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
respose
a.chat("what is the color of it?")

View file

@ -1,129 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
from PIL import Image
minigpt4_path = os.path.join(os.path.dirname(__file__), "MiniGPT-4")
sys.path.insert(0, minigpt4_path)
from minigpt4.models.blip2 import Blip2Base
from minigpt4.processors.blip_processors import Blip2ImageEvalProcessor
class MiniGPT4(Blip2Base):
"""
MiniGPT4 model from https://github.com/Vision-CAIR/MiniGPT-4
"""
def __init__(self,
args,
vit_model="eva_clip_g",
q_former_model="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth",
img_size=224,
drop_path_rate=0,
use_grad_checkpoint=False,
vit_precision="fp32",
freeze_vit=True,
freeze_qformer=True,
num_query_token=32,
llama_model="",
prompt_path="",
prompt_template="",
max_txt_len=32,
end_sym='\n',
low_resource=False, # use 8 bit and put vit in cpu
device_8bit=0
):
super().__init__()
self.img_size = img_size
self.low_resource = low_resource
self.preprocessor = Blip2ImageEvalProcessor(img_size)
print('Loading VIT')
self.visual_encoder, self.ln_vision = self.init_vision_encoder(
vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
)
print('Loading VIT Done')
print('Loading Q-Former')
self.Qformer, self.query_tokens = self.init_Qformer(
num_query_token, self.visual_encoder.num_features
)
self.Qformer.cls = None
self.Qformer.bert.embeddings.word_embeddings = None
self.Qformer.bert.embeddings.position_embeddings = None
for layer in self.Qformer.bert.encoder.layer:
layer.output = None
layer.intermediate = None
self.load_from_pretrained(url_or_filename=q_former_model)
print('Loading Q-Former Done')
self.llama_proj = nn.Linear(
self.Qformer.config.hidden_size, 5120 # self.llama_model.config.hidden_size
)
self.max_txt_len = max_txt_len
self.end_sym = end_sym
self.model = MyModel(["main", *args])
# system prompt
self.model.eval_string("Give the following image: <Img>ImageContent</Img>. "
"You will be able to see the image once I provide it to you. Please answer my questions."
"###")
def encode_img(self, image):
image = self.preprocessor(image)
image = image.unsqueeze(0)
device = image.device
if self.low_resource:
self.vit_to_cpu()
image = image.to("cpu")
with self.maybe_autocast():
image_embeds = self.ln_vision(self.visual_encoder(image)).to(device)
image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(device)
query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
query_output = self.Qformer.bert(
query_embeds=query_tokens,
encoder_hidden_states=image_embeds,
encoder_attention_mask=image_atts,
return_dict=True,
)
inputs_llama = self.llama_proj(query_output.last_hidden_state)
# atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(image.device)
return inputs_llama
def load_projection(self, path):
state = torch.load(path)["model"]
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def chat(self, question):
self.model.eval_string("Human: ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
def chat_with_image(self, image, question):
with torch.no_grad():
embd_image = self.encode_img(image)
embd_image = embd_image.cpu().numpy()[0]
self.model.eval_string("Human: <Img>")
self.model.eval_float(embd_image.T)
self.model.eval_string("</Img> ")
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
return self.model.generate_with_print(end="###")
if __name__=="__main__":
a = MiniGPT4(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048"])
a.load_projection(os.path.join(
os.path.dirname(__file__) ,
"pretrained_minigpt4.pth"))
respose = a.chat_with_image(
Image.open("./media/llama1-logo.png").convert('RGB'),
"what is the text in the picture?")
a.chat("what is the color of it?")

View file

@ -1,99 +0,0 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))
from embd_input import MyModel
import numpy as np
from torch import nn
import torch
# use PandaGPT path
panda_gpt_path = os.path.join(os.path.dirname(__file__), "PandaGPT")
imagebind_ckpt_path = "./models/panda_gpt/"
sys.path.insert(0, os.path.join(panda_gpt_path,"code","model"))
from ImageBind.models import imagebind_model
from ImageBind import data
ModalityType = imagebind_model.ModalityType
max_tgt_len = 400
class PandaGPT:
def __init__(self, args):
self.visual_encoder,_ = imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
self.visual_encoder.eval()
self.llama_proj = nn.Linear(1024, 5120) # self.visual_hidden_size, 5120)
self.max_tgt_len = max_tgt_len
self.model = MyModel(["main", *args])
self.generated_text = ""
self.device = "cpu"
def load_projection(self, path):
state = torch.load(path, map_location="cpu")
self.llama_proj.load_state_dict({
"weight": state["llama_proj.weight"],
"bias": state["llama_proj.bias"]})
def eval_inputs(self, inputs):
self.model.eval_string("<Img>")
embds = self.extract_multimoal_feature(inputs)
for i in embds:
self.model.eval_float(i.T)
self.model.eval_string("</Img> ")
def chat(self, question):
return self.chat_with_image(None, question)
def chat_with_image(self, inputs, question):
if self.generated_text == "":
self.model.eval_string("###")
self.model.eval_string(" Human: ")
if inputs:
self.eval_inputs(inputs)
self.model.eval_string(question)
self.model.eval_string("\n### Assistant:")
ret = self.model.generate_with_print(end="###")
self.generated_text += ret
return ret
def extract_multimoal_feature(self, inputs):
features = []
for key in ["image", "audio", "video", "thermal"]:
if key + "_paths" in inputs:
embeds = self.encode_data(key, inputs[key+"_paths"])
features.append(embeds)
return features
def encode_data(self, data_type, data_paths):
type_map = {
"image": ModalityType.VISION,
"audio": ModalityType.AUDIO,
"video": ModalityType.VISION,
"thermal": ModalityType.THERMAL,
}
load_map = {
"image": data.load_and_transform_vision_data,
"audio": data.load_and_transform_audio_data,
"video": data.load_and_transform_video_data,
"thermal": data.load_and_transform_thermal_data
}
load_function = load_map[data_type]
key = type_map[data_type]
inputs = {key: load_function(data_paths, self.device)}
with torch.no_grad():
embeddings = self.visual_encoder(inputs)
embeds = embeddings[key]
embeds = self.llama_proj(embeds).cpu().numpy()
return embeds
if __name__=="__main__":
a = PandaGPT(["--model", "./models/ggml-vicuna-13b-v0-q4_1.bin", "-c", "2048", "--lora", "./models/panda_gpt/ggml-adapter-model.bin","--temp", "0"])
a.load_projection("./models/panda_gpt/adapter_model.bin")
a.chat_with_image(
{"image_paths": ["./media/llama1-logo.png"]},
"what is the text in the picture? 'llama' or 'lambda'?")
a.chat("what is the color of it?")

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -4,5 +4,5 @@ install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

View file

@ -39,8 +39,8 @@ static gpt_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;
static bool is_interacting = false;
static void write_logfile(
const llama_context * ctx, const gpt_params & params, const llama_model * model,
@ -104,7 +104,7 @@ static void sigint_handler(int signo) {
int main(int argc, char ** argv) {
gpt_params params;
llama_sampling_params & sparams = params.sampling_params;
llama_sampling_params & sparams = params.sparams;
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
@ -246,14 +246,14 @@ int main(int argc, char ** argv) {
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(ctx));
embd_inp.push_back(llama_token_middle(model));
LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix));
LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix));
@ -261,7 +261,7 @@ int main(int argc, char ** argv) {
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(ctx));
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
@ -358,36 +358,10 @@ int main(int argc, char ** argv) {
LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
struct llama_grammar * grammar = NULL;
grammar_parser::parse_state parsed_grammar;
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
LOG_TEE("%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, parsed_grammar);
LOG_TEE("\n");
{
auto it = sparams.logit_bias.find(llama_token_eos(ctx));
if (it != sparams.logit_bias.end() && it->second == -INFINITY) {
LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
}
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
LOG_TEE("\n##### Infill mode #####\n\n");
if (params.infill) {
printf("\n************\n");
@ -430,7 +404,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while (n_remain != 0 || params.interactive) {
// predict
@ -549,7 +523,7 @@ int main(int argc, char ** argv) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
llama_sampling_accept(ctx_sampling, ctx, id);
llama_sampling_accept(ctx_sampling, ctx, id, true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@ -567,8 +541,11 @@ int main(int argc, char ** argv) {
LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed);
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
@ -600,10 +577,10 @@ int main(int argc, char ** argv) {
if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode
if ((ctx_sampling->prev.back() == llama_token_eot(ctx) || is_interacting) && params.interactive){
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if(is_interacting && !params.interactive_first) {
// print an eot token
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
}
fflush(stdout);
printf("\n");
@ -617,7 +594,7 @@ int main(int argc, char ** argv) {
buffer += line;
} while (another_line);
// check if we got an empty line, if so we use the old input
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_prefix = buffer;
}
buffer.clear();
@ -627,7 +604,7 @@ int main(int argc, char ** argv) {
buffer += line;
} while (another_line);
// check if we got an empty line
if(!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) {
params.input_suffix = buffer;
}
buffer.clear();
@ -640,7 +617,7 @@ int main(int argc, char ** argv) {
process_escapes(params.input_suffix);
}
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
suff_rm_leading_spc = false;
}
@ -650,14 +627,14 @@ int main(int argc, char ** argv) {
if (suff_rm_leading_spc && inp_sfx[0] == space_token) {
inp_sfx.erase(inp_sfx.begin());
}
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx));
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
if (add_bos) {
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx));
inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model));
}
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx));
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
embd_inp = inp_pfx;
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(ctx));
embd_inp.push_back(llama_token_middle(model));
embd.clear();
embd_guidance.clear();
n_remain = params.n_predict;
@ -667,7 +644,7 @@ int main(int argc, char ** argv) {
is_interacting = false;
}
// deal with end of text token in interactive mode
else if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
LOG("found EOS token\n");
if (params.interactive) {
@ -684,7 +661,7 @@ int main(int argc, char ** argv) {
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(ctx));
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
@ -740,22 +717,14 @@ int main(int argc, char ** argv) {
if (n_past > 0) {
if (is_interacting) {
// reset grammar state if we're restarting generation
if (grammar != NULL) {
llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root"));
}
llama_sampling_reset(ctx_sampling);
}
is_interacting = false;
}
}
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) {
if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) {
break;
}
@ -767,7 +736,7 @@ int main(int argc, char ** argv) {
}
}
if (!params.interactive && n_remain <= 0) {
printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str());
printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
fflush(stdout);
}
@ -778,9 +747,7 @@ int main(int argc, char ** argv) {
llama_free(ctx);
llama_free_model(model);
if (grammar != NULL) {
llama_grammar_free(grammar);
}
llama_sampling_free(ctx_sampling);
llama_backend_free();
#ifndef LOG_DISABLE_LOGS

View file

@ -933,7 +933,7 @@ struct sql_printer : public printer {
};
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
int n_processed = 0;
llama_set_n_threads(ctx, n_threads, n_threads);
@ -946,7 +946,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
}
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
llama_token token = llama_token_bos(ctx);
llama_token token = llama_token_bos(llama_get_model(ctx));
llama_set_n_threads(ctx, n_threads, n_threads);

View file

@ -1,7 +1,7 @@
set(TARGET clip)
add_library(${TARGET} clip.cpp clip.h)
install(TARGETS ${TARGET} LIBRARY)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if (NOT MSVC)
target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h

View file

@ -610,8 +610,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
for (int i = 0; i < 3; ++i) {
new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
}
if (verbosity >= 2) {

View file

@ -16,13 +16,29 @@ checkpoint = torch.load(path)
mm_tensors = [k for k, v in checkpoint.items() if k.startswith("model.mm_projector")]
# store these tensors in a new dictionary and torch.save them
projector = {name: checkpoint[name] for name in mm_tensors}
projector = {name: checkpoint[name].float() for name in mm_tensors}
torch.save(projector, f"{args.model}/llava.projector")
# remove these tensors from the checkpoint and save it again
for name in mm_tensors:
del checkpoint[name]
# BakLLaVA models contain CLIP tensors in it
clip_tensors = [k for k, v in checkpoint.items() if k.startswith("model.vision_tower")]
if len(clip_tensors) > 0:
clip = {name.replace("vision_tower.vision_tower.", ""): checkpoint[name].float() for name in clip_tensors}
torch.save(clip, f"{args.model}/llava.clip")
# remove these tensors
for name in clip_tensors:
del checkpoint[name]
# added tokens should be removed to be able to convert Mistral models
if os.path.exists(f"{args.model}/added_tokens.json"):
with open(f"{args.model}/added_tokens.json", "w") as f:
f.write("{}\n")
torch.save(checkpoint, path)
print("Done!")

View file

@ -58,28 +58,30 @@ inline bool eval_string(struct llama_context * ctx_llama, const char* str, int n
// TODO: use common/sampling.h
inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
// out of user input, sample next token
const float temp = params.sampling_params.temp;
const int32_t top_k = params.sampling_params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : params.sampling_params.top_k;
const float top_p = params.sampling_params.top_p;
const float tfs_z = params.sampling_params.tfs_z;
const float typical_p = params.sampling_params.typical_p;
// const int32_t repeat_last_n = params.sampling_params.repeat_last_n < 0 ? n_ctx : params.sampling_params.repeat_last_n;
// const float repeat_penalty = params.sampling_params.repeat_penalty;
// const float alpha_presence = params.sampling_params.presence_penalty;
// const float alpha_frequency = params.sampling_params.frequency_penalty;
const int mirostat = params.sampling_params.mirostat;
const float mirostat_tau = params.sampling_params.mirostat_tau;
const float mirostat_eta = params.sampling_params.mirostat_eta;
// const bool penalize_nl = params.sampling_params.penalize_nl;
auto & sparams = params.sparams;
// out of user input, sample next token
const float temp = sparams.temp;
const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx_llama)) : sparams.top_k;
const float top_p = sparams.top_p;
const float tfs_z = sparams.tfs_z;
const float typical_p = sparams.typical_p;
// const int32_t repeat_last_n = sparams.repeat_last_n < 0 ? n_ctx : sparams.repeat_last_n;
// const float repeat_penalty = sparams.repeat_penalty;
// const float alpha_presence = sparams.presence_penalty;
// const float alpha_frequency = sparams.frequency_penalty;
const int mirostat = sparams.mirostat;
const float mirostat_tau = sparams.mirostat_tau;
const float mirostat_eta = sparams.mirostat_eta;
// const bool penalize_nl = sparams.penalize_nl;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(llama_get_model(ctx_llama));
// Apply params.logit_bias map
for (auto it = params.sampling_params.logit_bias.begin(); it != params.sampling_params.logit_bias.end(); it++) {
// Apply params.logit_bias map
for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) {
logits[it->first] += it->second;
}
@ -91,18 +93,18 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, repeat_penalty);
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl(ctx)] = nl_logit;
// }
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, repeat_penalty);
// llama_sample_frequency_and_presence_penalties(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
// Greedy sampling
@ -135,7 +137,7 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) {
inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) {
int id = sample_id(ctx_llama, params);
static std::string ret;
if (id == llama_token_eos(ctx_llama)) {
if (id == llama_token_eos(llama_get_model(ctx_llama))) {
ret = "</s>";
} else {
ret = llama_token_to_piece(ctx_llama, id);

View file

@ -16,6 +16,8 @@ add_library(common OBJECT
${_common_path}/console.cpp
${_common_path}/grammar-parser.h
${_common_path}/grammar-parser.cpp
${_common_path}/sampling.h
${_common_path}/sampling.cpp
)
# WARNING: because build-info.h is auto-generated, it will only

View file

@ -108,7 +108,7 @@ int main(int argc, char ** argv) {
if (!gpt_params_parse(argc, argv, params)) {
return 1;
}
llama_sampling_params & sparams = params.sampling_params;
llama_sampling_params & sparams = params.sparams;
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("main", "log"));
@ -248,7 +248,7 @@ int main(int argc, char ** argv) {
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(ctx));
embd_inp.push_back(llama_token_bos(model));
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
@ -415,8 +415,7 @@ int main(int argc, char ** argv) {
}
}
}
LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau);
LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
LOG_TEE("\n\n");
@ -459,7 +458,7 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
@ -612,7 +611,7 @@ int main(int argc, char ** argv) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
llama_sampling_accept(ctx_sampling, ctx, id);
llama_sampling_accept(ctx_sampling, ctx, id, true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@ -631,12 +630,9 @@ int main(int argc, char ** argv) {
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
// GG: I'm not sure it's a good idea to push the prompt tokens into the sampling context
// Most likely will remove this in the future to avoid exposing "prev"
// Same thing is done in "server". If we stop pushing the prompt tokens, then the repetition
// penalty will be applied only based on the tokens generated by the model.
ctx_sampling->prev.erase(ctx_sampling->prev.begin());
ctx_sampling->prev.push_back(embd_inp[n_consumed]);
// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
@ -667,12 +663,10 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// check for reverse prompt
// check for reverse prompt in the last n_prev tokens
if (!params.antiprompt.empty()) {
std::string last_output;
for (auto id : ctx_sampling->prev) {
last_output += llama_token_to_piece(ctx, id);
}
const int n_prev = 32;
const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev);
is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output.
@ -699,7 +693,7 @@ int main(int argc, char ** argv) {
}
// deal with end of text token in interactive mode
if (ctx_sampling->prev.back() == llama_token_eos(ctx)) {
if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) {
LOG("found EOS token\n");
if (params.interactive) {
@ -726,7 +720,7 @@ int main(int argc, char ** argv) {
if (params.input_prefix_bos) {
LOG("adding input prefix BOS token\n");
embd_inp.push_back(llama_token_bos(ctx));
embd_inp.push_back(llama_token_bos(model));
}
std::string buffer;
@ -767,6 +761,9 @@ int main(int argc, char ** argv) {
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
if (params.escape) {
process_escapes(buffer);
}
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
@ -807,7 +804,7 @@ int main(int argc, char ** argv) {
}
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
LOG_TEE(" [end of text]\n");
break;
}

View file

@ -157,7 +157,7 @@ int main(int argc, char ** argv) {
for (size_t i = 0; i < clients.size(); ++i) {
auto & client = clients[i];
client.id = i;
client.ctx_sampling = llama_sampling_init(params);
client.ctx_sampling = llama_sampling_init(params.sparams);
}
std::vector<llama_token> tokens_system;
@ -330,7 +330,7 @@ int main(int argc, char ** argv) {
const llama_token id = llama_sampling_sample(client.ctx_sampling, ctx, NULL, client.i_batch - i);
llama_sampling_accept(client.ctx_sampling, ctx, id);
llama_sampling_accept(client.ctx_sampling, ctx, id, true);
if (client.n_decoded == 1) {
// start measuring generation time after the first token to make sure all concurrent clients
@ -347,7 +347,7 @@ int main(int argc, char ** argv) {
// client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str());
if (client.n_decoded > 2 &&
(id == llama_token_eos(ctx) ||
(id == llama_token_eos(model) ||
(params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) ||
client.response.find("User:") != std::string::npos ||
client.response.find('\n') != std::string::npos)) {

View file

@ -227,7 +227,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(ctx);
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
const auto batch_logits = llama_get_logits(ctx);
@ -350,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
// add BOS token for the first batch of each chunk
if (add_bos && j == 0) {
tokens[batch_start] = llama_token_bos(ctx);
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
}
if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {

View file

@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()

View file

@ -24,6 +24,10 @@ Command line options:
- `--port`: Set the port to listen. Default: `8080`.
- `--path`: path from which to serve static files (default examples/server/public)
- `--embedding`: Enable embedding extraction, Default: disabled.
- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
## Build
@ -158,6 +162,8 @@ node index.js
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)
`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
*Result JSON:*
Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
@ -188,6 +194,12 @@ node index.js
`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)
`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- **POST** `/tokenize`: Tokenize a given text.
*Options:*
@ -218,8 +230,32 @@ node index.js
It also accepts all the options of `/completion` except `stream` and `prompt`.
- **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
## More examples
### Change system prompt on runtime
To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
`prompt`: Specify a context that you want all connecting clients to respect.
`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
```json
{
"system_prompt": {
"prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
"anti_prompt": "User:",
"assistant_name": "Assistant:"
}
}
```
**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
### Interactive mode
Check the sample in [chat.mjs](chat.mjs).

View file

@ -8,6 +8,7 @@ import json
app = Flask(__name__)
slot_id = -1
parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
if(is_present(body, "stop")): postData["stop"] += body["stop"]
postData["n_keep"] = -1
postData["stream"] = stream
postData["cache_prompt"] = True
postData["slot_id"] = slot_id
return postData
def make_resData(data, chat=False, promptToken=[]):
@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
}
]
}
slot_id = data["slot_id"]
if (chat):
if (start):
resData["choices"][0]["delta"] = {

View file

@ -7,6 +7,11 @@ const args = process.argv.slice(2);
const grammarJsonSchemaFile = args.find(
(_, index) => args[index - 1] === "--grammar-json-schema"
);
const no_cached_prompt = args.find(
(_, index) => args[index - 1] === "--no-cache-prompt"
) ?? "false";
const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
// Example usage: function,arguments
@ -30,6 +35,9 @@ if (grammarFile) {
grammar = readFileSync(grammarFile, 'utf-8')
}
// for cached prompt
let slot_id = -1;
const API_URL = 'http://127.0.0.1:8080'
const chat = [
@ -76,6 +84,8 @@ async function chat_completion(question) {
top_p: 0.9,
n_keep: n_keep,
n_predict: 256,
cache_prompt: no_cached_prompt === "false",
slot_id: slot_id,
stop: ["\n### Human:"], // stop completion after generating this
grammar,
stream: true,
@ -92,6 +102,7 @@ async function chat_completion(question) {
const t = Buffer.from(chunk).toString('utf8')
if (t.startsWith('data: ')) {
const message = JSON.parse(t.substring(6))
slot_id = message.slot_id
answer += message.content
process.stdout.write(message.content)
if (message.stop) {

File diff suppressed because it is too large Load diff

View file

@ -125,6 +125,7 @@
background-color: #222;
color: #ddd;
}
code {
font-family: monospace;
padding: 0.1em 0.3em;
@ -141,7 +142,8 @@
display: inline;
}
header, footer {
header,
footer {
text-align: center;
}
@ -163,6 +165,7 @@
0% {
background-position: 0%;
}
100% {
background-position: 100%;
}
@ -181,6 +184,7 @@
--loading-color-1: #22222200;
--loading-color-2: #222222ff;
}
.popover-content {
background-color: black;
}
@ -194,6 +198,8 @@
import { llama } from '/completion.js';
import { SchemaConverter } from '/json-schema-to-grammar.mjs';
let selected_image = false;
var slot_id = -1;
const session = signal({
prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
@ -203,6 +209,7 @@
type: "chat", // "chat" | "completion"
char: "Llama",
user: "User",
image_selected: ''
})
const params = signal({
@ -220,7 +227,9 @@
mirostat_tau: 5, // target entropy
mirostat_eta: 0.1, // learning rate
grammar: '',
n_probs: 0, // no completion_probabilities
n_probs: 0, // no completion_probabilities,
image_data: [],
cache_prompt: true
})
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */
@ -270,6 +279,7 @@
// saved templates were successfuly imported.
console.log('Processing saved templates and updating default template')
params.value = { ...params.value, image_data: [] };
//console.log(importedTemplates);
savedUserTemplates.value = importedTemplates;
@ -294,7 +304,9 @@
function userTemplateApply(t) {
session.value = t.data.session;
session.value = { ...session.value, image_selected: '' };
params.value = t.data.params;
params.value = { ...params.value, image_data: [] };
}
function userTemplateResetToDefaultAndApply() {
@ -385,20 +397,25 @@
throw new Error("already running");
}
controller.value = new AbortController();
for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
const data = chunk.data;
if (data.stop) {
while (
currentMessages.length > 0 &&
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
) {
) {
currentMessages.pop();
}
transcriptUpdate([...history, [char, currentMessages]])
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
} else {
currentMessages.push(data);
slot_id = data.slot_id;
if (selected_image && !data.multimodal) {
alert("The server was not compiled for multimodal or the model projector can't be loaded.");
return;
}
transcriptUpdate([...history, [char, currentMessages]])
}
@ -419,7 +436,7 @@
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
const prompt = template(session.value.template, {
let prompt = template(session.value.template, {
message: msg,
history: session.value.transcript.flatMap(
([name, data]) =>
@ -434,9 +451,12 @@
)
).join("\n"),
});
if (selected_image) {
prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
}
await runLlama(prompt, {
...params.value,
slot_id: slot_id,
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
}, "{{char}}");
}
@ -446,10 +466,11 @@
console.log('already running...');
return;
}
const {prompt} = session.value;
const { prompt } = session.value;
transcriptUpdate([...session.value.transcript, ["", prompt]]);
await runLlama(prompt, {
...params.value,
slot_id: slot_id,
stop: [],
}, "");
}
@ -467,6 +488,27 @@
transcriptUpdate([]);
}
const uploadImage = (e) => {
e.preventDefault();
document.getElementById("fileInput").click();
document.getElementById("fileInput").addEventListener("change", function (event) {
const selectedFile = event.target.files[0];
if (selectedFile) {
const reader = new FileReader();
reader.onload = function () {
const image_data = reader.result;
session.value = { ...session.value, image_selected: image_data };
params.value = {
...params.value, image_data: [
{ data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
}
};
selected_image = true;
reader.readAsDataURL(selectedFile);
}
});
}
function MessageInput() {
const message = useSignal("")
@ -497,6 +539,7 @@
</div>
<div class="right">
<button type="submit" disabled=${generating.value}>Send</button>
<button onclick=${uploadImage}>Upload Image</button>
<button onclick=${stop} disabled=${!generating.value}>Stop</button>
<button onclick=${reset}>Reset</button>
</div>
@ -540,7 +583,7 @@
data;
message = html`<${Markdownish} text=${template(text)} />`
}
if(user) {
if (user) {
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
} else {
return html`<p key=${index}>${message}</p>`
@ -549,6 +592,7 @@
return html`
<section id="chat" ref=${container}>
<img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
${messages.flatMap(chatLine)}
</section>`;
};
@ -567,7 +611,7 @@
const converter = new SchemaConverter(
grammarJsonSchemaPropOrder.value
.split(',')
.reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
.reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
)
converter.visit(schema, '')
params.value = {
@ -579,7 +623,7 @@
}
}
const FloatField = ({label, max, min, name, step, value}) => {
const FloatField = ({ label, max, min, name, step, value }) => {
return html`
<div>
<label for="${name}">${label}</label>
@ -589,7 +633,7 @@
`
};
const IntField = ({label, max, min, name, value}) => {
const IntField = ({ label, max, min, name, value }) => {
return html`
<div>
<label for="${name}">${label}</label>
@ -672,7 +716,7 @@
${GrammarControl()}
</fieldset>
`
);
);
const CompletionConfigForm = () => (
html`
@ -694,20 +738,20 @@
${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}
<fieldset class="two">
${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
</fieldset>
<details>
<summary>More options</summary>
<fieldset class="two">
${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
</fieldset>
<hr />
<fieldset class="three">
@ -716,11 +760,11 @@
<label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
<label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
</div>
${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
</fieldset>
<fieldset>
${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
</fieldset>
</details>
</form>
@ -759,20 +803,20 @@
const popoverChildren = html`
<div class="prob-set">
${probs.map((p, index) => {
return html`
return html`
<div
key=${index}
title=${`prob: ${p.prob}`}
style=${{
padding: '0.3em',
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
}}
padding: '0.3em',
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
}}
>
<span>${p.tok_str}: </span>
<span>${Math.floor(p.prob * 100)}%</span>
</div>
`
})}
})}
</div>
`
@ -851,9 +895,9 @@
ref=${popoverRef}
class="popover-content"
style=${{
top: position.value.top,
left: position.value.left,
}}
top: position.value.top,
left: position.value.left,
}}
>
${props.popoverChildren}
</div>
@ -952,8 +996,11 @@
</head>
<body>
<div id="container"></div>
<div id="container">
<input type="file" id="fileInput" accept="image/*" style="display: none;">
</div>
<div id="portal"></div>
</body>
</html>

File diff suppressed because it is too large Load diff

View file

@ -138,7 +138,7 @@ int main(int argc, char ** argv) {
const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of stream?
if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) {
if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
LOG_TEE("\n");
break;

View file

@ -112,16 +112,16 @@ int main(int argc, char ** argv) {
bool has_eos = false;
// target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params);
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
// draft sequence data
std::vector<seq_draft> drafts(n_seq_dft);
params.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sampling_params.temp = std::max(0.01f, params.sampling_params.temp);
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sparams.temp = std::max(0.01f, params.sparams.temp);
for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params);
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);
}
llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);
@ -154,7 +154,7 @@ int main(int argc, char ** argv) {
// sample from the target model
llama_token id = llama_sampling_sample(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]);
llama_sampling_accept(ctx_sampling, ctx_tgt, id);
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);
//LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_tgt, ctx_sampling->prev).c_str());
@ -163,7 +163,7 @@ int main(int argc, char ** argv) {
printf("%s", token_str.c_str());
fflush(stdout);
if (id == llama_token_eos(ctx_tgt)) {
if (id == llama_token_eos(model_tgt)) {
has_eos = true;
}
@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
const int s = sa[is];
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id);
llama_sampling_accept(drafts[s].ctx_sampling, ctx_dft, id, true);
drafts[s].tokens.push_back(id);

View file

@ -29,6 +29,8 @@
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasCreate hipblasCreate
#define cublasGemmEx hipblasGemmEx
#define cublasGemmBatchedEx hipblasGemmBatchedEx
#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
@ -4326,13 +4328,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
const half * x = (const half *) vx;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
const int channel_x = channel / channel_x_divisor;
const int nrows_y = ncols_x;
const int nrows_y = ncols_x;
const int nrows_dst = nrows_x;
const int row_dst = row_x;
const int row_dst = row_x;
const int idst = channel*nrows_dst + row_dst;
@ -4345,13 +4347,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
break;
}
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
const float xi = __half2float(x[ix]);
const int row_y = col_x;
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
const int iy = channel*nrows_y + row_y;
const float xi = __half2float(x[ix]);
tmp += xi * y[iy];
}
@ -5658,10 +5660,10 @@ void ggml_init_cublas() {
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
for (int64_t id = 0; id < g_device_count; ++id) {
for (int id = 0; id < g_device_count; ++id) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
g_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
@ -5671,15 +5673,15 @@ void ggml_init_cublas() {
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
for (int64_t id = 0; id < g_device_count; ++id) {
for (int id = 0; id < g_device_count; ++id) {
g_tensor_split[id] /= total_vram;
}
for (int64_t id = 0; id < g_device_count; ++id) {
for (int id = 0; id < g_device_count; ++id) {
CUDA_CHECK(ggml_cuda_set_device(id));
// create cuda streams
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
for (int is = 0; is < MAX_STREAMS; ++is) {
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
}
@ -6248,16 +6250,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
GGML_ASSERT(src0_dd_i != nullptr);
GGML_ASSERT(src0_dd_i != nullptr);
GGML_ASSERT(src1_ddf_i != nullptr);
GGML_ASSERT(dst_dd_i != nullptr);
GGML_ASSERT(dst_dd_i != nullptr);
const int64_t ne00 = src0->ne[0];
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
int id;
@ -7009,7 +7010,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens
}
static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1));
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16);
@ -7019,11 +7021,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne12 = src1->ne[2];
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2];
const int64_t ne12 = src1->ne[2];
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
@ -7042,6 +7044,159 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
}
static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t nb01 = src0->nb[1];
const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int64_t nb11 = src1->nb[1];
const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
const int64_t ne1 = ggml_nelements(src1);
const int64_t ne = ggml_nelements(dst);
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
int id;
CUDA_CHECK(cudaGetDevice(&id));
CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream));
ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
void * src0_ddq = src0_extra->data_device[g_main_device];
half * src0_as_f16 = (half *) src0_ddq;
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
// convert src1 to fp16
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
GGML_ASSERT(to_fp16_cuda != nullptr);
size_t src1_as = 0;
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
size_t dst_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0);
// broadcast factors
const int64_t r2 = ne12/ne02;
const int64_t r3 = ne13/ne03;
const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f;
#if 0
// use cublasGemmEx
{
for (int i13 = 0; i13 < ne13; ++i13) {
for (int i12 = 0; i12 < ne12; ++i12) {
int i03 = i13 / r3;
int i02 = i12 / r2;
CUBLAS_CHECK(
cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half),
(const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
&beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
CUBLAS_COMPUTE_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
}
}
#else
if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx
CUBLAS_CHECK(
cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA
(const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB
&beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC
ne12*ne13,
CUBLAS_COMPUTE_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
// use cublasGemmBatchedEx
// TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000
const int ne23 = ne12*ne13;
// TODO: avoid this alloc
void ** ptrs = (void **) malloc(3*ne23*sizeof(void *));
for (int i13 = 0; i13 < ne13; ++i13) {
for (int i12 = 0; i12 < ne12; ++i12) {
int i03 = i13 / r3;
int i02 = i12 / r2;
ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3];
ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2;
ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2;
}
}
// allocate device memory for pointers
void ** ptrs_as = nullptr;
CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *)));
// TODO: this does not work for some reason -- not sure why?
//size_t ptrs_s = 0;
//ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s);
// copy pointers to device
CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice));
free(ptrs);
CUBLAS_CHECK(
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half),
(const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float),
&beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01,
ne23,
CUBLAS_COMPUTE_16F,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
// free device memory for pointers
CUDA_CHECK(cudaFree(ptrs_as));
//ggml_cuda_pool_free(ptrs_as, ptrs_s);
}
#endif
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
ggml_cuda_pool_free(src1_as_f16, src1_as);
ggml_cuda_pool_free(dst_f16, dst_as);
}
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU;
@ -7054,10 +7209,23 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
}
}
// debug helpers
//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
} else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {

View file

@ -62,6 +62,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(mul);
GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast
GGML_METAL_DECL_KERNEL(scale);
GGML_METAL_DECL_KERNEL(scale_4);
GGML_METAL_DECL_KERNEL(silu);
GGML_METAL_DECL_KERNEL(relu);
GGML_METAL_DECL_KERNEL(gelu);
@ -249,6 +250,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(mul);
GGML_METAL_ADD_KERNEL(mul_row);
GGML_METAL_ADD_KERNEL(scale);
GGML_METAL_ADD_KERNEL(scale_4);
GGML_METAL_ADD_KERNEL(silu);
GGML_METAL_ADD_KERNEL(relu);
GGML_METAL_ADD_KERNEL(gelu);
@ -347,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(mul);
GGML_METAL_DEL_KERNEL(mul_row);
GGML_METAL_DEL_KERNEL(scale);
GGML_METAL_DEL_KERNEL(scale_4);
GGML_METAL_DEL_KERNEL(silu);
GGML_METAL_DEL_KERNEL(relu);
GGML_METAL_DEL_KERNEL(gelu);
@ -923,15 +926,20 @@ void ggml_metal_graph_compute(
const float scale = *(const float *) src1->data;
[encoder setComputePipelineState:ctx->pipeline_scale];
int64_t n = ggml_nelements(dst);
if (n % 4 == 0) {
n /= 4;
[encoder setComputePipelineState:ctx->pipeline_scale_4];
} else {
[encoder setComputePipelineState:ctx->pipeline_scale];
}
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
const int64_t n = ggml_nelements(dst);
GGML_ASSERT(n % 4 == 0);
[encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
} break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(gf->nodes[i])) {

View file

@ -125,9 +125,17 @@ kernel void kernel_mul_row(
}
kernel void kernel_scale(
device const float * src0,
device float * dst,
constant float & scale,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * scale;
}
kernel void kernel_scale_4(
device const float4 * src0,
device float4 * dst,
constant float & scale,
constant float & scale,
uint tpig[[thread_position_in_grid]]) {
dst[tpig] = src0[tpig] * scale;
}

View file

@ -1497,47 +1497,46 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
size_t x_offset = 0;
int64_t pi02 = -1;
int64_t pi03 = -1;
for (int64_t i13 = 0; i13 < ne13; i13++) {
int64_t i03 = i13 / r3;
for (int64_t i03 = 0; i03 < ne03; i03++) {
// TODO: copy src0 here when r3>1
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
if (src0->backend == GGML_BACKEND_GPU) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else {
// copy src0 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
}
for (int64_t i12 = 0; i12 < ne12; i12++) {
int64_t i02 = i12 / r2;
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
// copy src1 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
// copy data to device
if (src0->backend == GGML_BACKEND_GPU) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else if (i02 != pi02 || i03 != pi03) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
pi02 = i02;
pi03 = i03;
CL_CHECK(clFinish(queue));
// compute
cl_event ev_sgemm;
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, x_offset, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);
if (status != clblast::StatusCode::kSuccess) {
printf("\nF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11);
GGML_ASSERT(false);
}
// copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
}
}
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
CL_CHECK(clFinish(queue));
// compute
cl_event ev_sgemm;
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, x_offset, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);
if (status != clblast::StatusCode::kSuccess) {
printf("\nF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11);
GGML_ASSERT(false);
}
// copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
}
}
@ -1598,74 +1597,71 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
size_t x_offset = 0;
int64_t pi02 = -1;
int64_t pi03 = -1;
for (int64_t i13 = 0; i13 < ne13; i13++) {
int64_t i03 = i13 / r3;
for (int64_t i12 = 0; i12 < ne12; i12++) {
int64_t i02 = i12 / r2;
// copy src0 to device
if (src0->backend == GGML_BACKEND_GPU) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else if (i02 != pi02 || i03 != pi03) {
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
pi02 = i02;
pi03 = i03;
}
// convert src1 to fp16
// TODO: use multiple threads
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
if (src1_cont_rows) {
if (src1_cont_cols) {
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
for (int64_t i03 = 0; i03 < ne03; i03++) {
// TODO: copy src0 here when r3>1
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
if (src0->backend == GGML_BACKEND_GPU) {
x_offset = (i03 * ne02 + i02) * x_ne;
} else {
// copy src0 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
}
else {
for (int64_t i11 = 0; i11 < ne11; i11++) {
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
// convert src1 to fp16
// TODO: use multiple threads
char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
if (src1_cont_rows) {
if (src1_cont_cols) {
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
}
else {
for (int64_t i11 = 0; i11 < ne11; i11++) {
ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
}
}
}
}
}
else {
for (int64_t i11 = 0; i11 < ne11; i11++) {
for (int64_t i10 = 0; i10 < ne10; i10++) {
// very slow due to no inlining
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
else {
for (int64_t i11 = 0; i11 < ne11; i11++) {
for (int64_t i10 = 0; i10 < ne10; i10++) {
// very slow due to no inlining
tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
}
}
}
// copy src1 to device
CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
// compute
cl_event ev_sgemm;
clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor,
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, x_offset, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);
if (status != clblast::StatusCode::kSuccess) {
printf("\nF16 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11);
GGML_ASSERT(false);
}
// copy dst to host, then convert to float
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_fp16_to_fp32_row(tmp, d, d_ne);
}
}
// copy src1 to device
CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
// compute
cl_event ev_sgemm;
clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor,
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, x_offset, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, &ev_sgemm);
if (status != clblast::StatusCode::kSuccess) {
printf("\nF16 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11);
GGML_ASSERT(false);
}
// copy dst to host, then convert to float
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
ggml_fp16_to_fp32_row(tmp, d, d_ne);
}
}
@ -1728,86 +1724,82 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
size_t ev_idx = 0;
std::vector<cl_event> events;
int64_t pi02 = -1;
int64_t pi03 = -1;
for (int64_t i13 = 0; i13 < ne13; i13++) {
int64_t i03 = i13 / r3;
for (int64_t i12 = 0; i12 < ne12; i12++) {
int64_t i02 = i12 / r2;
// copy src0 to device if necessary
if (src0->backend == GGML_BACKEND_CPU) {
if (i02 != pi02 || i03 != pi03) {
for (int64_t i03 = 0; i03 < ne03; i03++) {
// TODO: copy and dequantize src0 here when r3>1
for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
// copy src0 to device if necessary
if (src0->backend == GGML_BACKEND_CPU) {
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
pi02 = i02;
pi03 = i03;
}
} else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->extra;
} else {
GGML_ASSERT(false);
}
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
// copy src1 to device
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
// compute
const size_t global = ne01 * local;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
const cl_int ncols = ne00;
events.emplace_back();
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
} else { // general dequantization kernel + CLBlast matrix matrix multiplication
// convert src0 to fp32 on device
const size_t global = x_ne / global_denom;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, offset > 0 ? &offset : NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
// copy src1 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
events.emplace_back();
// wait for conversion
CL_CHECK(clFinish(queue));
// compute
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, events.data() + ev_idx++);
if (status != clblast::StatusCode::kSuccess) {
printf("\nQF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11);
} else if (src0->backend == GGML_BACKEND_GPU) {
d_Q = (cl_mem) src0->extra;
} else {
GGML_ASSERT(false);
}
}
// copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
for (auto *event : events) {
clReleaseEvent(event);
}
if (!mul_mat_vec) {
// convert src0 to fp32 on device
const size_t global = x_ne / global_denom;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X));
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
}
ev_idx = 0;
events.clear();
for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
// copy src1 to device
events.emplace_back();
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
// compute
const size_t global = ne01 * local;
const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0;
const cl_int ncols = ne00;
events.emplace_back();
CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q));
CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL));
CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y));
CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D));
CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols));
CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++));
} else { // CLBlast matrix matrix multiplication
// copy src1 to device
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
// wait for conversion
CL_CHECK(clFinish(queue));
// compute
events.emplace_back();
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
ne01, ne11, ne10,
alpha,
d_X, 0, ne00,
d_Y, 0, ne10,
beta,
d_D, 0, ne01,
&queue, events.data() + ev_idx++);
if (status != clblast::StatusCode::kSuccess) {
printf("\nQF32 Matmul Failed (%d): [dims: %ld,%ld,%ld,%ld] You may be out of VRAM. Please check if you have enough.\n",static_cast<int>(status),ne00,ne01,ne10,ne11);
GGML_ASSERT(false);
}
}
// copy dst to host
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
for (auto *event : events) {
clReleaseEvent(event);
}
ev_idx = 0;
events.clear();
}
}
}
}
@ -1823,22 +1815,13 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
const int64_t ne02 = src0->ne[2];
const int64_t ne03 = src0->ne[3];
const int64_t ne10 = src1->ne[0];
const int64_t ne12 = src1->ne[2];
const int64_t ne13 = src1->ne[3];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
// ref: https://github.com/ggerganov/ggml/pull/224
// TODO: find the optimal values for these
if (ne02 == ne12 && ne03 == ne13 &&
(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) {
@ -1924,4 +1907,4 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
tensor->extra = dst;
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}
}

464
ggml.c
View file

@ -571,7 +571,6 @@ int64_t ggml_cycles_per_ms(void) {
#define ggml_perf_cycles_per_ms() 0
#endif
//
// cache line
//
@ -1829,7 +1828,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
return type_traits[type];
}
//
// simd mappings
//
@ -4058,16 +4056,17 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"ALIBI",
"CLAMP",
"CONV_1D",
"CONV_1D_STAGE_0",
"CONV_1D_STAGE_1",
"CONV_TRANSPOSE_1D",
"CONV_2D",
"CONV_2D_STAGE_0",
"CONV_2D_STAGE_1",
"CONV_TRANSPOSE_2D",
"POOL_1D",
"POOL_2D",
"UPSCALE",
"CONV_1D_STAGE_0",
"CONV_1D_STAGE_1",
"FLASH_ATTN",
"FLASH_FF",
"FLASH_ATTN_BACK",
@ -4093,7 +4092,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};
static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@ -4144,16 +4143,17 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"alibi(x)",
"clamp(x)",
"conv_1d(x)",
"conv_1d_stage_0(x)",
"conv_1d_stage_1(x)",
"conv_transpose_1d(x)",
"conv_2d(x)",
"conv_2d_stage_0(x)",
"conv_2d_stage_1(x)",
"conv_transpose_2d(x)",
"pool_1d(x)",
"pool_2d(x)",
"upscale(x)",
"conv_1d_stage_0(x)",
"conv_1d_stage_1(x)",
"flash_attn(x)",
"flash_ff(x)",
"flash_attn_back(x)",
@ -4179,7 +4179,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};
static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -4210,8 +4210,10 @@ static void ggml_setup_op_has_task_pass(void) {
p[GGML_OP_CONV_1D ] = true;
p[GGML_OP_CONV_1D_STAGE_0 ] = true;
p[GGML_OP_CONV_1D_STAGE_1 ] = true;
p[GGML_OP_CONV_2D ] = true;
p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
p[GGML_OP_CONV_2D ] = true;
p[GGML_OP_CONV_2D_STAGE_0 ] = true;
p[GGML_OP_CONV_2D_STAGE_1 ] = true;
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
p[GGML_OP_FLASH_ATTN_BACK ] = true;
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
@ -5955,7 +5957,6 @@ struct ggml_tensor * ggml_sqrt_inplace(
return ggml_sqrt_impl(ctx, a, true);
}
// ggml_log
static struct ggml_tensor * ggml_log_impl(
@ -6009,7 +6010,6 @@ struct ggml_tensor * ggml_sum(
return result;
}
// ggml_sum_rows
struct ggml_tensor * ggml_sum_rows(
@ -6641,7 +6641,6 @@ struct ggml_tensor * ggml_set_2d_inplace(
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
}
// ggml_cpy
static struct ggml_tensor * ggml_cpy_impl(
@ -6721,7 +6720,6 @@ struct ggml_tensor * ggml_cont_inplace(
return ggml_cont_impl(ctx, a, true);
}
// make contiguous, with new shape
GGML_API struct ggml_tensor * ggml_cont_1d(
struct ggml_context * ctx,
@ -7174,7 +7172,6 @@ struct ggml_tensor * ggml_diag(
return result;
}
// ggml_diag_mask_inf
static struct ggml_tensor * ggml_diag_mask_inf_impl(
@ -7286,7 +7283,6 @@ struct ggml_tensor * ggml_soft_max_inplace(
return ggml_soft_max_impl(ctx, a, true);
}
// ggml_soft_max_back
static struct ggml_tensor * ggml_soft_max_back_impl(
@ -7703,7 +7699,11 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
// ggml_conv_2d
struct ggml_tensor * ggml_conv_2d(
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
// a: [OCIC, KH, KW]
// b: [N, IC, IH, IW]
// result: [N, OH, OW, IC*KH*KW]
static struct ggml_tensor * ggml_conv_2d_stage_0(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
@ -7722,17 +7722,21 @@ struct ggml_tensor * ggml_conv_2d(
is_node = true;
}
const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
const int64_t ne[4] = {
ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
a->ne[3], b->ne[3],
a->ne[2] * a->ne[1] * a->ne[0],
OW,
OH,
b->ne[3],
};
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
int32_t params[] = { s0, s1, p0, p1, d0, d1 };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_CONV_2D;
result->op = GGML_OP_CONV_2D_STAGE_0;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
@ -7741,8 +7745,61 @@ struct ggml_tensor * ggml_conv_2d(
}
// ggml_conv_2d_sk_p0
// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
// a: [OC, IC, KH, KW]
// b: [N, OH, OW, IC * KH * KW]
// result: [N, OC, OH, OW]
static struct ggml_tensor * ggml_conv_2d_stage_1(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b) {
bool is_node = false;
if (a->grad || b->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
const int64_t ne[4] = {
b->ne[1],
b->ne[2],
a->ne[3],
b->ne[3],
};
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
result->op = GGML_OP_CONV_2D_STAGE_1;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = b;
return result;
}
// a: [OCIC, KH, KW]
// b: [N, IC, IH, IW]
// result: [N, OC, OH, OW]
struct ggml_tensor * ggml_conv_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1) {
struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
result = ggml_conv_2d_stage_1(ctx, a, result);
return result;
}
// ggml_conv_2d_sk_p0
struct ggml_tensor * ggml_conv_2d_sk_p0(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -8181,7 +8238,6 @@ static struct ggml_tensor * ggml_add_rel_pos_impl(
return result;
}
struct ggml_tensor * ggml_add_rel_pos(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -8626,8 +8682,6 @@ struct ggml_tensor * ggml_map_custom3_inplace(
return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
}
// ggml_cross_entropy_loss
struct ggml_tensor * ggml_cross_entropy_loss(
@ -9829,7 +9883,6 @@ static void ggml_compute_forward_add1(
}
}
// ggml_compute_forward_acc
static void ggml_compute_forward_acc_f32(
@ -9969,7 +10022,6 @@ static void ggml_compute_forward_sub_f32(
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
#ifdef GGML_USE_ACCELERATE
vDSP_vsub(
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
@ -10150,7 +10202,6 @@ static void ggml_compute_forward_div_f32(
const int i2 = (ir - i3*ne2*ne1)/ne1;
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
#ifdef GGML_USE_ACCELERATE
UNUSED(ggml_vec_div_f32);
@ -10288,7 +10339,6 @@ static void ggml_compute_forward_sqrt(
}
}
// ggml_compute_forward_log
static void ggml_compute_forward_log_f32(
@ -12121,7 +12171,6 @@ static void ggml_compute_forward_out_prod_f32(
}
}
//int64_t t1 = ggml_perf_time_us();
//static int64_t acc = 0;
//acc += t1 - t0;
@ -12317,7 +12366,6 @@ static void ggml_compute_forward_scale_f32(
const size_t nb1 = dst->nb[1];
for (int i1 = ir0; i1 < ir1; i1++) {
if (dst->data != src0->data) {
// src0 is same shape as dst => same indices
@ -12715,7 +12763,6 @@ static void ggml_compute_forward_get_rows_back_f32(
}
}
static void ggml_compute_forward_get_rows_back(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@ -13538,7 +13585,7 @@ static void ggml_compute_forward_rope_f16(
dst_data[n_dims] = GGML_FP32_TO_FP16(x2*cos_block_theta - x3*sin_block_theta);
dst_data[n_dims/2*3] = GGML_FP32_TO_FP16(x2*sin_block_theta + x3*cos_block_theta);
}
} if (!is_neox) {
} else if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta);
const float sin_theta = sinf(theta);
@ -13998,6 +14045,7 @@ static void ggml_compute_forward_conv_1d_f32(
}
}
// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
ggml_fp16_t * A,
ggml_fp16_t * B,
@ -14299,6 +14347,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
}
}
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
return;
}
@ -14371,7 +14422,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
float * dst_data = wdata + i01*ne00*ne02;
for (int64_t i00 = 0; i00 < ne00; i00++) {
dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00];
dst_data[i00*ne02 + i02] = src[i00];
}
}
}
@ -14390,6 +14441,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
}
}
// need to zero dst since we are accumulating into it
memset(dst->data, 0, ggml_nbytes(dst));
return;
}
@ -14451,6 +14505,144 @@ static void ggml_compute_forward_conv_transpose_1d(
// ggml_compute_forward_conv_2d
// src0: kernel [OC, IC, KH, KW]
// src1: image [N, IC, IH, IW]
// dst: result [N, OH, OW, IC*KH*KW]
static void ggml_compute_forward_conv_2d_stage_0_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F16);
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS;
const int64_t N = ne13;
const int64_t IC = ne12;
const int64_t IH = ne11;
const int64_t IW = ne10;
// const int64_t OC = ne03;
// const int64_t IC = ne02;
const int64_t KH = ne01;
const int64_t KW = ne00;
const int64_t OH = ne2;
const int64_t OW = ne1;
const int ith = params->ith;
const int nth = params->nth;
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
if (params->type == GGML_TASK_INIT) {
memset(dst->data, 0, ggml_nbytes(dst));
return;
}
if (params->type == GGML_TASK_FINALIZE) {
return;
}
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
for (int64_t in = 0; in < N; in++) {
for (int64_t ioh = 0; ioh < OH; ioh++) {
for (int64_t iow = 0; iow < OW; iow++) {
for (int64_t iic = ith; iic < IC; iic+=nth) {
// micro kernel
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
for (int64_t ikh = 0; ikh < KH; ikh++) {
for (int64_t ikw = 0; ikw < KW; ikw++) {
const int64_t iiw = iow*s0 + ikw*d0 - p0;
const int64_t iih = ioh*s1 + ikh*d1 - p1;
if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
}
}
}
}
}
}
}
}
}
// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
// src0: [OC, IC, KH, KW]
// src1: [N, OH, OW, IC * KH * KW]
// result: [N, OC, OH, OW]
static void ggml_compute_forward_conv_2d_stage_1_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F16);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
if (params->type == GGML_TASK_INIT) {
return;
}
if (params->type == GGML_TASK_FINALIZE) {
return;
}
GGML_TENSOR_BINARY_OP_LOCALS;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb0 == sizeof(float));
const int N = ne13;
const int OH = ne12;
const int OW = ne11;
const int OC = ne03;
const int IC = ne02;
const int KH = ne01;
const int KW = ne00;
const int ith = params->ith;
const int nth = params->nth;
int64_t m = OC;
int64_t n = OH * OW;
int64_t k = IC * KH * KW;
// [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
for (int i = 0; i < N; i++) {
ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
float * C = (float *)dst->data + i * m * n; // [m, n]
gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
}
}
static void ggml_compute_forward_conv_2d_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@ -14463,16 +14655,40 @@ static void ggml_compute_forward_conv_2d_f16_f32(
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
GGML_TENSOR_BINARY_OP_LOCALS;
GGML_TENSOR_BINARY_OP_LOCALS
// src1: image [N, IC, IH, IW]
// src0: kernel [OC, IC, KH, KW]
// dst: result [N, OC, OH, OW]
// ne12: IC
// ne0: OW
// ne1: OH
// nk0: KW
// nk1: KH
// ne13: N
const int N = ne13;
const int IC = ne12;
const int IH = ne11;
const int IW = ne10;
const int OC = ne03;
// const int IC = ne02;
const int KH = ne01;
const int KW = ne00;
const int OH = ne1;
const int OW = ne0;
const int ith = params->ith;
const int nth = params->nth;
const int nk0 = ne00;
const int nk1 = ne01;
// const int nk0 = ne00;
// const int nk1 = ne01;
// size of the convolution row - the kernel size unrolled across all channels
const int ew0 = nk0*nk1*ne02;
// const int ew0 = nk0*nk1*ne02;
// ew0: IC*KH*KW
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
@ -14488,24 +14704,27 @@ static void ggml_compute_forward_conv_2d_f16_f32(
memset(params->wdata, 0, params->wsize);
// prepare source data (src1)
// im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
{
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
for (int i13 = 0; i13 < ne13; i13++) {
for (int i12 = 0; i12 < ne12; i12++) {
const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
for (int in = 0; in < N; in++) {
for (int iic = 0; iic < IC; iic++) {
for (int ioh = 0; ioh < OH; ioh++) {
for (int iow = 0; iow < OW; iow++) {
for (int i1 = 0; i1 < ne1; i1++) {
for (int i0 = 0; i0 < ne0; i0++) {
for (int ik1 = 0; ik1 < nk1; ik1++) {
for (int ik0 = 0; ik0 < nk0; ik0++) {
const int idx0 = i0*s0 + ik0*d0 - p0;
const int idx1 = i1*s1 + ik1*d1 - p1;
// micro kernel
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
for (int ikh = 0; ikh < KH; ikh++) {
for (int ikw = 0; ikw < KW; ikw++) {
const int iiw = iow*s0 + ikw*d0 - p0;
const int iih = ioh*s1 + ikh*d1 - p1;
if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
}
}
}
@ -14522,30 +14741,22 @@ static void ggml_compute_forward_conv_2d_f16_f32(
return;
}
// total patches in dst
const int np = ne2;
// patches per thread
const int dp = (np + nth - 1)/nth;
// patch range for this thread
const int ip0 = dp*ith;
const int ip1 = MIN(ip0 + dp, np);
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
// wdata: [N*OH*OW, IC*KH*KW]
// dst: result [N, OC, OH, OW]
// src0: kernel [OC, IC, KH, KW]
for (int i3 = 0; i3 < ne3; i3++) {
for (int i2 = ip0; i2 < ip1; i2++) {
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
int64_t m = OC;
int64_t n = OH * OW;
int64_t k = IC * KH * KW;
for (int i1 = 0; i1 < ne1; ++i1) {
for (int i0 = 0; i0 < ne0; ++i0) {
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
(ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
}
}
}
// [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
for (int i = 0; i < N; i++) {
ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
float * C = (float *)dst->data + i * m * n; // [m * k]
gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
}
}
@ -14571,6 +14782,48 @@ static void ggml_compute_forward_conv_2d(
}
}
static void ggml_compute_forward_conv_2d_stage_0(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
} break;
case GGML_TYPE_F32:
{
GGML_ASSERT(false);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
static void ggml_compute_forward_conv_2d_stage_1(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
} break;
case GGML_TYPE_F32:
{
GGML_ASSERT(false);
} break;
default:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_conv_transpose_2d
static void ggml_compute_forward_conv_transpose_2d(
@ -14629,6 +14882,8 @@ static void ggml_compute_forward_conv_transpose_2d(
}
}
memset(dst->data, 0, ggml_nbytes(dst));
return;
}
@ -16127,7 +16382,6 @@ static void ggml_compute_forward_add_rel_pos_f32(
const int ip0 = dp*ith;
const int ip1 = MIN(ip0 + dp, np);
for (int64_t i13 = ip0; i13 < ip1; ++i13) {
for (int64_t i12 = 0; i12 < ne12; ++i12) {
for (int64_t i11 = 0; i11 < ne11; ++i11) {
@ -16194,7 +16448,6 @@ static void ggml_compute_forward_map_unary_f32(
}
}
static void ggml_compute_forward_map_unary(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@ -16242,7 +16495,6 @@ static void ggml_compute_forward_map_binary_f32(
}
}
static void ggml_compute_forward_map_binary(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
@ -16294,7 +16546,6 @@ static void ggml_compute_forward_map_custom2_f32(
fun(dst, a, b);
}
// ggml_compute_forward_map_custom3
static void ggml_compute_forward_map_custom3_f32(
@ -16569,7 +16820,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
ggml_vec_sub_f32(nc, ds0, ds0, s1);
ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
#ifndef NDEBUG
for (int i = 0; i < nc; ++i) {
assert(!isnan(ds0[i]));
@ -16597,12 +16847,15 @@ static void ggml_compute_forward_cross_entropy_loss_back(
}
}
/////////////////////////////////
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
GGML_ASSERT(params);
if (tensor->op == GGML_OP_NONE) {
return;
}
#ifdef GGML_USE_CUBLAS
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
if (skip_cpu) {
@ -16805,6 +17058,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_CONV_2D_STAGE_0:
{
ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_CONV_2D_STAGE_1:
{
ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
@ -17734,11 +17995,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CONV_2D:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CONV_TRANSPOSE_1D:
case GGML_OP_CONV_2D_STAGE_0:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CONV_2D_STAGE_1:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
@ -18663,6 +18932,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
const int64_t ne0 = node->ne[0];
const int64_t ne1 = node->ne[1];
const int64_t ne2 = node->ne[2];
const int64_t ne3 = node->ne[3];
const int64_t nk = ne00*ne01;
const int64_t ew0 = nk * ne02;
@ -18673,7 +18943,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
if (node->src[0]->type == GGML_TYPE_F16 &&
node->src[1]->type == GGML_TYPE_F32) {
cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
// im2col: [N*OH*OW, IC*KH*KW]
cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
} else if (node->src[0]->type == GGML_TYPE_F32 &&
node->src[1]->type == GGML_TYPE_F32) {
cur = sizeof(float)* (ne10*ne11*ne12);
@ -18683,6 +18954,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
work_size = MAX(work_size, cur);
} break;
case GGML_OP_CONV_2D_STAGE_0:
{
n_tasks = n_threads;
} break;
case GGML_OP_CONV_2D_STAGE_1:
{
n_tasks = n_threads;
} break;
case GGML_OP_CONV_TRANSPOSE_2D:
{
n_tasks = n_threads;
@ -19167,6 +19446,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
if (idx == -1) {
fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
fclose(fout);
return;
}
@ -19870,7 +20150,6 @@ static enum ggml_opt_result ggml_opt_adam(
opt->loss_after = fx;
// check convergence
if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
GGML_PRINT_DEBUG("converged\n");
@ -20841,7 +21120,7 @@ struct gguf_kv {
};
struct gguf_header {
uint32_t magic;
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
@ -20911,7 +21190,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));
ctx->header.magic = GGUF_MAGIC;
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
ctx->header.n_tensors = 0;
ctx->header.n_kv = 0;
@ -20937,16 +21216,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// offset from start of file
size_t offset = 0;
uint32_t magic = 0;
char magic[4];
// check the magic before making allocations
{
gguf_fread_el(file, &magic, sizeof(magic), &offset);
if (magic != GGUF_MAGIC) {
fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
fclose(file);
return NULL;
for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
fclose(file);
return NULL;
}
}
}
@ -20956,7 +21237,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// read the header
{
ctx->header.magic = magic;
strncpy(ctx->header.magic, magic, 4);
ctx->kv = NULL;
ctx->infos = NULL;

20
ggml.h
View file

@ -231,8 +231,9 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
#define GGUF_MAGIC 0x46554747 // "GGUF"
#define GGUF_VERSION 2
#define GGUF_MAGIC "GGUF"
#define GGUF_VERSION 3
#define GGUF_DEFAULT_ALIGNMENT 32
@ -407,15 +408,16 @@ extern "C" {
GGML_OP_ALIBI,
GGML_OP_CLAMP,
GGML_OP_CONV_1D,
GGML_OP_CONV_2D,
GGML_OP_CONV_1D_STAGE_0, // internal
GGML_OP_CONV_1D_STAGE_1, // internal
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_CONV_2D,
GGML_OP_CONV_2D_STAGE_0, // internal
GGML_OP_CONV_2D_STAGE_1, // internal
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_CONV_1D_STAGE_0, // internal
GGML_OP_CONV_1D_STAGE_1, // internal
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_FLASH_ATTN,
@ -1026,9 +1028,9 @@ extern "C" {
struct ggml_tensor * b,
float eps);
// A: n columns, m rows
// B: n columns, p rows (i.e. we transpose it internally)
// result is m columns, p rows
// A: k columns, n rows => [ne03, ne02, n, k]
// B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
// result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
GGML_API struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx,
struct ggml_tensor * a,

View file

@ -19,9 +19,10 @@ import numpy as np
#
GGUF_MAGIC = 0x46554747
GGUF_VERSION = 2
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32
# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum):
Q6_K = 14
Q8_K = 15
class GGUFEndian(IntEnum):
LITTLE = 0
BIG = 1
class GGUFValueType(IntEnum):
UINT8 = 0
@ -644,18 +649,41 @@ class GGUFWriter:
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
tensors: list[tuple[np.ndarray[Any, Any], int]]
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
@property
def pack_prefix(self):
if self.endianess==GGUFEndian.LITTLE:
return "<"
else:
return ">"
def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
self.fout = open(path, "wb")
self.arch = arch
self.endianess = endianess
self._simple_value_packing = {
GGUFValueType.UINT8: f"{self.pack_prefix}B",
GGUFValueType.INT8: f"{self.pack_prefix}b",
GGUFValueType.UINT16: f"{self.pack_prefix}H",
GGUFValueType.INT16: f"{self.pack_prefix}h",
GGUFValueType.UINT32: f"{self.pack_prefix}I",
GGUFValueType.INT32: f"{self.pack_prefix}i",
GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
GGUFValueType.UINT64: f"{self.pack_prefix}Q",
GGUFValueType.INT64: f"{self.pack_prefix}q",
GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
GGUFValueType.BOOL: "?" ,
}
self.add_architecture()
self.use_temp_file = use_temp_file
self.tensors = []
endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
print(f"This gguf file is for {endianess_str} only")
def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC))
self.fout.write(struct.pack("<I", GGUF_VERSION))
self.fout.write(struct.pack("<Q", self.ti_data_count))
self.fout.write(struct.pack("<Q", self.kv_data_count))
self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
self.flush()
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@ -727,25 +755,12 @@ class GGUFWriter:
self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY)
_simple_value_packing = {
GGUFValueType.UINT8: "<B",
GGUFValueType.INT8: "<b",
GGUFValueType.UINT16: "<H",
GGUFValueType.INT16: "<h",
GGUFValueType.UINT32: "<I",
GGUFValueType.INT32: "<i",
GGUFValueType.FLOAT32: "<f",
GGUFValueType.UINT64: "<Q",
GGUFValueType.INT64: "<q",
GGUFValueType.FLOAT64: "<d",
GGUFValueType.BOOL: "?" ,
}
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
if vtype is None:
vtype = GGUFValueType.get_type(val)
if add_vtype:
self.kv_data += struct.pack("<I", vtype)
self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
self.kv_data_count += 1
pack_fmt = self._simple_value_packing.get(vtype)
@ -753,14 +768,14 @@ class GGUFWriter:
self.kv_data += struct.pack(pack_fmt, val)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val
self.kv_data += struct.pack("<Q", len(encoded_val))
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
self.kv_data += struct.pack("<I", ltype)
self.kv_data += struct.pack("<Q", len(val))
self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
for item in val:
self.add_val(item, add_vtype=False)
else:
@ -774,22 +789,24 @@ class GGUFWriter:
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<Q", len(encoded_name))
self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
self.ti_data += struct.pack("<I", n_dims)
self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
for i in range(n_dims):
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor)
self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1
def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
if self.endianess == GGUFEndian.BIG:
tensor.byteswap(inplace=True)
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp.seek(0)
@ -815,6 +832,8 @@ class GGUFWriter:
fp.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
if self.endianess==GGUFEndian.BIG:
tensor.byteswap(inplace=True)
self.write_padding(self.fout, self.fout.tell())
tensor.tofile(self.fout)
self.write_padding(self.fout, tensor.nbytes)
@ -968,12 +987,15 @@ class SpecialVocab:
merges: list[str] = []
special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
special_token_ids: dict[str, int] = {}
n_vocab: int | None = None
def __init__(
self, path: str | os.PathLike[str], load_merges: bool = False,
special_token_types: tuple[str, ...] | None = None,
n_vocab: int | None = None,
):
self.special_token_ids = {}
self.n_vocab = n_vocab
self.load_merges = load_merges
if special_token_types is not None:
self.special_token_types = special_token_types
@ -983,6 +1005,16 @@ class SpecialVocab:
if not self._try_load_from_tokenizer_json(path):
self._try_load_from_config_json(path)
def _set_special_token(self, typ: str, tid: Any):
if not isinstance(tid, int) or tid < 0:
return
if self.n_vocab is None or tid < self.n_vocab:
self.special_token_ids[typ] = tid
return
print(f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
file = sys.stderr)
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer_file = path / 'tokenizer.json'
if not tokenizer_file.is_file():
@ -1010,10 +1042,11 @@ class SpecialVocab:
tc_content = entry_content
else:
continue
for maybe_token_id in (atok.get('id') for atok in added_tokens if atok.get('content') == tc_content):
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id
break
# We only need the first match here.
maybe_token_id = next((
atok.get('id') for atok in added_tokens
if atok.get('content') == tc_content), None)
self._set_special_token(typ, maybe_token_id)
return True
def _try_load_from_config_json(self, path: Path) -> bool:
@ -1023,21 +1056,21 @@ class SpecialVocab:
with open(config_file, encoding = 'utf-8') as f:
config = json.load(f)
for typ in self.special_token_types:
maybe_token_id = config.get(f'{typ}_token_id')
if isinstance(maybe_token_id, int) and maybe_token_id >= 0:
self.special_token_ids[typ] = maybe_token_id
self._set_special_token(typ, config.get(f'{typ}_token_id'))
return True
def add_to_gguf(self, gw: GGUFWriter) -> None:
def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
if len(self.merges) > 0:
print(f'gguf: Adding {len(self.merges)} merge(s).')
if not quiet:
print(f'gguf: Adding {len(self.merges)} merge(s).')
gw.add_token_merges(self.merges)
for typ, tokid in self.special_token_ids.items():
handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
if handler is None:
print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping')
print(f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', file = sys.stderr)
continue
print(f'gguf: Setting special token type {typ} to {tokid}')
if not quiet:
print(f'gguf: Setting special token type {typ} to {tokid}')
handler(tokid)
def __repr__(self) -> str:

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.4.4"
version = "0.4.5"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [

View file

@ -176,7 +176,7 @@ static int GetEosID(FileFormat file_format, int32_t n_vocab)
{
if(file_format == FileFormat::GGUF_LLAMA || file_format==FileFormat::GGUF_FALCON)
{
eosID = llama_token_eos(llama_ctx_v4);
eosID = llama_token_eos(&(llama_ctx_v4->model));
}
else if(file_format == FileFormat::GGJT_3)
{
@ -369,9 +369,35 @@ void sample_top_a(llama_token_data_array * candidates, float a, size_t min_keep)
void sample_rep_pen(int n_ctx, int rep_pen_range, float rep_pen, llama_token_data_array * candidates_p)
{
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), rep_pen_range), n_ctx);
llama_sample_repetition_penalty(nullptr, candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, rep_pen);
const llama_token * last_tokens = last_n_tokens.data() + last_n_tokens.size() - last_n_repeat;
size_t last_tokens_size = last_n_repeat;
llama_token_data_array * candidates = candidates_p;
float penalty = rep_pen;
if (last_tokens_size == 0 || penalty == 1.0f) {
return;
}
const int64_t t_start_sample_us = ggml_time_us();
for (size_t i = 0; i < candidates->size; ++i) {
const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
if (token_iter == last_tokens + last_tokens_size) {
continue;
}
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
if (candidates->data[i].logit <= 0) {
candidates->data[i].logit *= penalty;
} else {
candidates->data[i].logit /= penalty;
}
}
candidates->sorted = false;
}
void sample_temperature(llama_token_data_array * candidates_p, float temp)

View file

@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if !defined(__riscv)
#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#endif

View file

@ -365,7 +365,7 @@ maxhordelen = 256
modelbusy = threading.Lock()
requestsinqueue = 0
defaultport = 5001
KcppVersion = "1.47.2"
KcppVersion = "1.48"
showdebug = True
showsamplerwarning = True
showmaxctxwarning = True

200
llama.cpp
View file

@ -987,7 +987,8 @@ static std::string llama_token_to_str(const struct llama_context * ctx, llama_to
result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
GGML_ASSERT(check == -n_tokens);
} else {
}
else {
result.resize(n_tokens);
}
@ -1023,8 +1024,8 @@ enum e_model {
};
static const size_t kB = 1024;
static const size_t MB = kB*kB;
static const size_t GB = kB*kB*kB;
static const size_t MB = 1024*kB;
static const size_t GB = 1024*MB;
struct llama_hparams {
bool vocab_only;
@ -1047,21 +1048,21 @@ struct llama_hparams {
float f_max_alibi_bias;
bool operator!=(const llama_hparams & other) const {
if (this->vocab_only != other.vocab_only) return true;
if (this->n_vocab != other.n_vocab) return true;
if (this->vocab_only != other.vocab_only) return true;
if (this->n_vocab != other.n_vocab) return true;
if (this->n_ctx_train != other.n_ctx_train) return true;
if (this->n_embd != other.n_embd) return true;
if (this->n_head != other.n_head) return true;
if (this->n_head_kv != other.n_head_kv) return true;
if (this->n_layer != other.n_layer) return true;
if (this->n_rot != other.n_rot) return true;
if (this->n_ff != other.n_ff) return true;
if (this->n_embd != other.n_embd) return true;
if (this->n_head != other.n_head) return true;
if (this->n_head_kv != other.n_head_kv) return true;
if (this->n_layer != other.n_layer) return true;
if (this->n_rot != other.n_rot) return true;
if (this->n_ff != other.n_ff) return true;
const float EPSILON = 1e-9;
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true;
if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true;
if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true;
if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true;
return false;
@ -1200,17 +1201,17 @@ struct llama_vocab {
id special_sep_id = -1;
id special_pad_id = -1;
id linefeed_id = 13;
id linefeed_id = 13;
id special_prefix_id = 32007;
id special_middle_id = 32009;
id special_suffix_id = 32008;
id special_eot_id = 32010;
id special_eot_id = 32010;
int find_bpe_rank(std::string token_left, std::string token_right) const {
replace_all(token_left, " ", "\u0120");
replace_all(token_left, "\n", "\u010A");
replace_all(token_right, " ", "\u0120");
replace_all(token_right, "\n", "\u010A");
GGML_ASSERT(token_left.find(" ") == std::string::npos);
GGML_ASSERT(token_left.find("\n") == std::string::npos);
GGML_ASSERT(token_right.find(" ") == std::string::npos);
GGML_ASSERT(token_right.find("\n") == std::string::npos);
auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
if (it == bpe_ranks.end()) {
@ -1364,10 +1365,7 @@ static bool llama_kv_cache_init(
cache.cells.clear();
cache.cells.resize(n_ctx);
// TODO: this should be:
// cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
// change it and test that it works
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
memset(cache.buf.data, 0, cache.buf.size);
struct ggml_init_params params;
@ -2266,15 +2264,35 @@ static void llm_load_vocab(
if (vocab.type == LLAMA_VOCAB_TYPE_SPM) {
vocab.linefeed_id = llama_byte_to_token(vocab, '\n');
} else {
vocab.linefeed_id = llama_tokenize_internal(vocab, "\u010A", false)[0];
const std::vector<int> ids = llama_tokenize_internal(vocab, "\u010A", false);
GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
vocab.linefeed_id = ids[0];
}
// special tokens
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
GGUF_GET_KEY(ctx, vocab.special_eos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_EOS_ID));
GGUF_GET_KEY(ctx, vocab.special_unk_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_UNK_ID));
GGUF_GET_KEY(ctx, vocab.special_sep_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_SEP_ID));
GGUF_GET_KEY(ctx, vocab.special_pad_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_PAD_ID));
{
const std::vector<std::pair<enum llm_kv, int32_t &>> special_token_types = {
{ LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id },
{ LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id },
{ LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id },
{ LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id },
{ LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id },
};
for (const auto & it : special_token_types) {
const std::string & key = kv(std::get<0>(it));
int32_t & id = std::get<1>(it), old_id = id;
GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
// Must be >= -1 and < vocab size. Since the key is unsigned, -1
// can only come from the default value, so there's no point in
// validating that.
if (size_t(id + 1) > vocab.id_to_token.size()) {
LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
__func__, key.c_str(), id, old_id);
id = old_id;
}
}
}
// build special tokens cache
{
@ -6131,11 +6149,10 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
}
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
static const char * hex = "0123456789ABCDEF";
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
char buf[7];
int result = snprintf(buf, sizeof(buf), "<0x%02X>", ch);
GGML_ASSERT(0 <= result && result < 7);
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
return vocab.token_to_id.at(buf);
}
case LLAMA_VOCAB_TYPE_BPE: {
@ -6568,7 +6585,6 @@ struct llm_tokenizer_bpe {
llm_symbol sym;
size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
sym.text = word.c_str() + offset;
sym.n = 1;
sym.n = char_len;
offset += sym.n;
sym.prev = index - 1;
@ -7307,7 +7323,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
std::vector<llama_grammar_candidate> rejects;
if (stack.empty()) {
for (auto tok : candidates) {
for (const auto & tok : candidates) {
if (*tok.code_points != 0 || tok.partial_utf8.n_remain != 0) {
rejects.push_back(tok);
}
@ -7318,7 +7334,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
const llama_grammar_element * stack_pos = stack.back();
std::vector<llama_grammar_candidate> next_candidates;
for (auto tok : candidates) {
for (const auto & tok : candidates) {
if (*tok.code_points == 0) {
// reached end of full codepoints in token, reject iff it ended in a partial sequence
// that cannot satisfy this position in grammar
@ -7344,7 +7360,7 @@ static std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_
llama_grammar_advance_stack(rules, stack_after, next_stacks);
auto next_rejects = llama_grammar_reject_candidates(rules, next_stacks, next_candidates);
for (auto tok : next_rejects) {
for (const auto & tok : next_rejects) {
rejects.push_back({ tok.index, tok.code_points - 1, tok.partial_utf8 });
}
@ -7674,37 +7690,15 @@ void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array
llama_sample_temp(ctx, candidates_p, temp);
}
void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty) {
if (last_tokens_size == 0 || penalty == 1.0f) {
return;
}
const int64_t t_start_sample_us = ggml_time_us();
for (size_t i = 0; i < candidates->size; ++i) {
const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
if (token_iter == last_tokens + last_tokens_size) {
continue;
}
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
if (candidates->data[i].logit <= 0) {
candidates->data[i].logit *= penalty;
} else {
candidates->data[i].logit /= penalty;
}
}
candidates->sorted = false;
if (ctx) {
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
}
}
void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, float alpha_frequency, float alpha_presence) {
if (last_tokens_size == 0 || (alpha_frequency == 0.0f && alpha_presence == 0.0f)) {
void llama_sample_repetition_penalties(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present) {
if (penalty_last_n == 0 || (penalty_repeat == 1.0f && penalty_freq == 0.0f && penalty_present == 0.0f)) {
return;
}
@ -7712,19 +7706,28 @@ void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, l
// Create a frequency map to count occurrences of each token in last_tokens
std::unordered_map<llama_token, int> token_count;
for (size_t i = 0; i < last_tokens_size; ++i) {
token_count[last_tokens_p[i]]++;
for (size_t i = 0; i < penalty_last_n; ++i) {
token_count[last_tokens[i]]++;
}
// Apply frequency and presence penalties to the candidates
for (size_t i = 0; i < candidates->size; ++i) {
auto token_iter = token_count.find(candidates->data[i].id);
const auto token_iter = token_count.find(candidates->data[i].id);
if (token_iter == token_count.end()) {
continue;
}
int count = token_iter->second;
candidates->data[i].logit -= float(count) * alpha_frequency + float(count > 0) * alpha_presence;
const int count = token_iter->second;
// The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
// This is common fix for this problem, which is to multiply by the penalty instead of dividing.
if (candidates->data[i].logit <= 0) {
candidates->data[i].logit *= penalty_repeat;
} else {
candidates->data[i].logit /= penalty_repeat;
}
candidates->data[i].logit -= float(count) * penalty_freq + float(count > 0) * penalty_present;
}
candidates->sorted = false;
@ -7746,7 +7749,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
}
}
const llama_token eos = llama_token_eos(ctx);
const llama_token eos = llama_token_eos(&ctx->model);
std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
std::vector<llama_grammar_candidate> candidates_grammar;
@ -7956,7 +7959,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra
void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) {
const int64_t t_start_sample_us = ggml_time_us();
if (token == llama_token_eos(ctx)) {
if (token == llama_token_eos(&ctx->model)) {
for (const auto & stack : grammar->stacks) {
if (stack.empty()) {
return;
@ -8861,8 +8864,10 @@ static int llama_apply_lora_from_file_internal(
throw std::runtime_error(format(
"%s: error: lora failed", __func__));
}
#if defined(GGML_USE_CUBLAS)
offload_func = ggml_cuda_assign_buffers;
offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
#endif
}
#endif // GGML_USE_CUBLAS
@ -9166,7 +9171,7 @@ struct llama_context * llama_new_context_with_model(
// build worst-case graph
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
int n_past = cparams.n_ctx - n_tokens;
llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
#ifdef GGML_USE_METAL
@ -9927,43 +9932,44 @@ float * llama_get_embeddings(struct llama_context * ctx) {
return ctx->embedding.data();
}
const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) {
return ctx->model.vocab.id_to_token[token].text.c_str();
const char * llama_token_get_text(const struct llama_model * model, llama_token token) {
return model->vocab.id_to_token[token].text.c_str();
}
float llama_token_get_score(const struct llama_context * ctx, llama_token token) {
return ctx->model.vocab.id_to_token[token].score;
float llama_token_get_score(const struct llama_model * model, llama_token token) {
return model->vocab.id_to_token[token].score;
}
llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) {
return ctx->model.vocab.id_to_token[token].type;
llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) {
return model->vocab.id_to_token[token].type;
}
llama_token llama_token_bos(const struct llama_context * ctx) {
return ctx->model.vocab.special_bos_id;
llama_token llama_token_bos(const struct llama_model * model) {
return model->vocab.special_bos_id;
}
llama_token llama_token_eos(const struct llama_context * ctx) {
return ctx->model.vocab.special_eos_id;
llama_token llama_token_eos(const struct llama_model * model) {
return model->vocab.special_eos_id;
}
llama_token llama_token_nl(const struct llama_context * ctx) {
return ctx->model.vocab.linefeed_id;
}
llama_token llama_token_prefix(const struct llama_context * ctx) {
return ctx->model.vocab.special_prefix_id;
llama_token llama_token_nl(const struct llama_model * model) {
return model->vocab.linefeed_id;
}
llama_token llama_token_middle(const struct llama_context * ctx) {
return ctx->model.vocab.special_middle_id;
llama_token llama_token_prefix(const struct llama_model * model) {
return model->vocab.special_prefix_id;
}
llama_token llama_token_suffix(const struct llama_context * ctx) {
return ctx->model.vocab.special_suffix_id;
llama_token llama_token_middle(const struct llama_model * model) {
return model->vocab.special_middle_id;
}
llama_token llama_token_eot(const struct llama_context * ctx) {
return ctx->model.vocab.special_eot_id;
llama_token llama_token_suffix(const struct llama_model * model) {
return model->vocab.special_suffix_id;
}
llama_token llama_token_eot(const struct llama_model * model) {
return model->vocab.special_eot_id;
}
int llama_tokenize(

37
llama.h
View file

@ -494,21 +494,22 @@ extern "C" {
// Vocab
//
LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token);
LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token);
LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence
LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
// codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle
LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix
LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
//
// Tokenization
@ -560,21 +561,15 @@ extern "C" {
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t last_tokens_size,
float penalty);
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_frequency_and_presence_penalties(
LLAMA_API void llama_sample_repetition_penalties(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t last_tokens_size,
float alpha_frequency,
float alpha_presence);
size_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present);
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.

View file

@ -293,7 +293,7 @@ void print_tok_vec(std::vector<float> &embd)
//try to adapt if the rope_freq_base_train exceeds the auto one
fkey = modelarch+".rope.freq_base";
auto keyidx = gguf_find_key(ctx, fkey.c_str());
keyidx = gguf_find_key(ctx, fkey.c_str());
if (keyidx != -1) {
float fbt = gguf_get_val_f32(ctx, keyidx);
fileformatmeta->freq_base_train = (fbt > 1.0f ? fbt : 0.0f);

View file

@ -3770,8 +3770,10 @@ int llama_v3_apply_lora_from_file_internal(const struct llama_v3_model & model,
throw std::runtime_error(format_old(
"%s: error: lora failed", __func__));
}
#if defined(GGML_USE_CUBLAS)
offload_func = ggml_cuda_assign_buffers;
offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
#endif
}
#endif // GGML_USE_CUBLAS

View file

@ -91,9 +91,19 @@ int main(int argc, char **argv) {
}
}
}
// TODO: why doesn't this work for the full range of Unicodes?
// Restrict to assigned unicode planes
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) {
for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 4;
}
}
for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_bpe(ctx, tokens);
@ -103,7 +113,6 @@ int main(int argc, char **argv) {
return 4;
}
}
llama_free_model(model);
llama_free(ctx);