Merge branch 'master' into concedo_experimental

# Conflicts:
#	.github/workflows/build.yml
#	.gitignore
#	CMakeLists.txt
#	Makefile
#	Package.swift
#	README.md
#	ggml-cuda.cu
#	llama.cpp
#	llama.h
#	scripts/sync-ggml.sh
#	tests/CMakeLists.txt

commit ec21fa7712
34 changed files with 5887 additions and 1435 deletions
.gitignore (vendored): 3 changes
@@ -81,6 +81,7 @@ poetry.toml

# Test binaries
tests/test-grammar-parser
+ /tests/test-llama-grammar
tests/test-double-float
tests/test-grad0
tests/test-opt

@@ -92,6 +93,8 @@ tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe
/tests/test-rope
/tests/test-backend-ops

+ /koboldcpp_default.so
+ /koboldcpp_failsafe.so
common/common.cpp

@@ -279,8 +279,18 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.yarn_beta_slow = std::stof(argv[i]);
-       } else if (arg == "--memory-f32") {
-           params.memory_f16 = false;
+       } else if (arg == "--samplers") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           }
+           sparams.samplers_sequence = parse_samplers_input(argv[i]);
+       } else if (arg == "--sampling-seq") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           }
+           sparams.samplers_sequence = argv[i];
        } else if (arg == "--top-p") {
            if (++i >= argc) {
                invalid_param = true;

@@ -499,6 +509,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            params.infill = true;
        } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
            params.dump_kv_cache = true;
+       } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+           params.no_kv_offload = true;
+       } else if (arg == "-ctk" || arg == "--cache-type-k") {
+           params.cache_type_k = argv[++i];
+       } else if (arg == "-ctv" || arg == "--cache-type-v") {
+           params.cache_type_v = argv[++i];
        } else if (arg == "--multiline-input") {
            params.multiline_input = true;
        } else if (arg == "--simple-io") {

@@ -679,6 +695,47 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                std::istreambuf_iterator<char>(),
                std::back_inserter(sparams.grammar)
            );
+       } else if (arg == "--override-kv") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           }
+           char * sep = strchr(argv[i], '=');
+           if (sep == nullptr || sep - argv[i] >= 128) {
+               fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]);
+               invalid_param = true;
+               break;
+           }
+           struct llama_model_kv_override kvo;
+           std::strncpy(kvo.key, argv[i], sep - argv[i]);
+           kvo.key[sep - argv[i]] = 0;
+           sep++;
+           if (strncmp(sep, "int:", 4) == 0) {
+               sep += 4;
+               kvo.tag = LLAMA_KV_OVERRIDE_INT;
+               kvo.int_value = std::atol(sep);
+           } else if (strncmp(sep, "float:", 6) == 0) {
+               sep += 6;
+               kvo.tag = LLAMA_KV_OVERRIDE_FLOAT;
+               kvo.float_value = std::atof(sep);
+           } else if (strncmp(sep, "bool:", 5) == 0) {
+               sep += 5;
+               kvo.tag = LLAMA_KV_OVERRIDE_BOOL;
+               if (std::strcmp(sep, "true") == 0) {
+                   kvo.bool_value = true;
+               } else if (std::strcmp(sep, "false") == 0) {
+                   kvo.bool_value = false;
+               } else {
+                   fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]);
+                   invalid_param = true;
+                   break;
+               }
+           } else {
+               fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
+               invalid_param = true;
+               break;
+           }
+           params.kv_overrides.push_back(kvo);
#ifndef LOG_DISABLE_LOGS
        // Parse args for logging parameters
        } else if ( log_param_single_parse( argv[i] ) ) {

@@ -722,6 +779,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
        }
    }

+   if (!params.kv_overrides.empty()) {
+       params.kv_overrides.emplace_back(llama_model_kv_override());
+       params.kv_overrides.back().key[0] = 0;
+   }

    return true;
}
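The empty-key terminator added above matters when the vector is later handed to llama.cpp. As a minimal sketch (not part of the diff; it assumes llama.h is included and mirrors the parsing logic above), a caller could assemble the same array by hand:

    // hypothetical sketch of building kv_overrides the way the parser above does
    std::vector<llama_model_kv_override> overrides;

    llama_model_kv_override kvo;
    std::strncpy(kvo.key, "tokenizer.ggml.add_bos_token", sizeof(kvo.key) - 1); // key must fit the fixed-size buffer
    kvo.key[sizeof(kvo.key) - 1] = 0;
    kvo.tag        = LLAMA_KV_OVERRIDE_BOOL;
    kvo.bool_value = false;
    overrides.push_back(kvo);

    // sentinel entry: an empty key marks the end of the array, which
    // llama_model_params_from_gpt_params asserts on before passing .data()
    overrides.emplace_back();
    overrides.back().key[0] = 0;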
@@ -762,6 +824,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+   printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
+   printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
    printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);

@@ -799,8 +863,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
    printf("  --no-penalize-nl      do not penalize newline token\n");
-   printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-   printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
    printf("  --temp N              temperature (default: %.1f)\n", (double)sparams.temp);
    printf("  --logits-all          return logits for all tokens in the batch (default: disabled)\n");
    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");

@@ -841,6 +903,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("  --verbose-prompt      print prompt before generation\n");
    printf("  -dkvc, --dump-kv-cache\n");
    printf("                        verbose print of the KV cache\n");
    printf("  -nkvo, --no-kv-offload\n");
    printf("                        disable KV offload\n");
    printf("  -ctk TYPE, --cache-type-k TYPE\n");
    printf("                        KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
    printf("  -ctv TYPE, --cache-type-v TYPE\n");
    printf("                        KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
    printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");

@@ -851,6 +919,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
+   printf("  --override-kv KEY=TYPE:VALUE\n");
+   printf("                        advanced option to override model metadata by key. may be specified multiple times.\n");
+   printf("                        types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
    printf("\n");
#ifndef LOG_DISABLE_LOGS
    log_print_usage();
@@ -887,6 +958,48 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
    GGML_UNREACHABLE();
}

+ //
+ // String parsing
+ //

+ std::string parse_samplers_input(std::string input) {
+     std::string output = "";
+     // since samplers names are written multiple ways
+     // make it ready for both system names and input names
+     std::unordered_map<std::string, char> samplers_symbols {
+         {"top_k",      'k'},
+         {"top-k",      'k'},
+         {"top_p",      'p'},
+         {"top-p",      'p'},
+         {"nucleus",    'p'},
+         {"typical_p",  'y'},
+         {"typical-p",  'y'},
+         {"typical",    'y'},
+         {"min_p",      'm'},
+         {"min-p",      'm'},
+         {"tfs_z",      'f'},
+         {"tfs-z",      'f'},
+         {"tfs",        'f'},
+         {"temp",       't'},
+         {"temperature",'t'}
+     };
+     // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
+     size_t separator = input.find(';');
+     while (separator != input.npos) {
+         std::string name = input.substr(0, separator);
+         input = input.substr(separator + 1);
+         separator = input.find(';');
+
+         if (samplers_symbols.find(name) != samplers_symbols.end()) {
+             output += samplers_symbols[name];
+         }
+     }
+     if (samplers_symbols.find(input) != samplers_symbols.end()) {
+         output += samplers_symbols[input];
+     }
+     return output;
+ }
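A quick usage sketch for the helper above (hypothetical inputs): every recognized alias contributes one character to the sampler-order string, and unknown names are dropped silently.

    std::string seq = parse_samplers_input("top-k;tfs;typical;top_p;min_p;temp");
    // seq == "kfypmt", which is also the default samplers_sequence

    std::string partial = parse_samplers_input("temperature;nucleus;unknown");
    // partial == "tp"; "unknown" matches no alias and is skipped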

//
// Model utils
//
@@ -901,10 +1014,39 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
    mparams.tensor_split = params.tensor_split;
    mparams.use_mmap     = params.use_mmap;
    mparams.use_mlock    = params.use_mlock;
+   if (params.kv_overrides.empty()) {
+       mparams.kv_overrides = NULL;
+   } else {
+       GGML_ASSERT(params.kv_overrides.back().key[0] == 0 && "KV overrides not terminated with empty key");
+       mparams.kv_overrides = params.kv_overrides.data();
+   }

    return mparams;
}

+ static ggml_type kv_cache_type_from_str(const std::string & s) {
+     if (s == "f16") {
+         return GGML_TYPE_F16;
+     }
+     if (s == "q8_0") {
+         return GGML_TYPE_Q8_0;
+     }
+     if (s == "q4_0") {
+         return GGML_TYPE_Q4_0;
+     }
+     if (s == "q4_1") {
+         return GGML_TYPE_Q4_1;
+     }
+     if (s == "q5_0") {
+         return GGML_TYPE_Q5_0;
+     }
+     if (s == "q5_1") {
+         return GGML_TYPE_Q5_1;
+     }
+
+     throw std::runtime_error("Invalid cache type: " + s);
+ }
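A hypothetical call site for the helper above: it converts the -ctk/-ctv strings, and an unknown name surfaces as an exception rather than a silent default.

    ggml_type tk = kv_cache_type_from_str("q8_0"); // GGML_TYPE_Q8_0
    ggml_type tv = kv_cache_type_from_str("f16");  // GGML_TYPE_F16
    // kv_cache_type_from_str("q6_k") throws std::runtime_error("Invalid cache type: q6_k")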

struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    auto cparams = llama_context_default_params();

@@ -914,7 +1056,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    cparams.n_threads_batch   = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
    cparams.mul_mat_q         = params.mul_mat_q;
    cparams.seed              = params.seed;
-   cparams.f16_kv            = params.memory_f16;
    cparams.logits_all        = params.logits_all;
    cparams.embedding         = params.embedding;
    cparams.rope_scaling_type = params.rope_scaling_type;

@@ -925,6 +1066,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
    cparams.yarn_beta_fast = params.yarn_beta_fast;
    cparams.yarn_beta_slow = params.yarn_beta_slow;
    cparams.yarn_orig_ctx  = params.yarn_orig_ctx;
+   cparams.offload_kqv    = !params.no_kv_offload;
+
+   cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
+   cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

    return cparams;
}
@@ -1337,7 +1482,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
    }
    fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
    fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
-   fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
    fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
    fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
    fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
common/common.h

@@ -94,6 +94,8 @@ struct gpt_params {
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
    std::string logdir = "";             // directory in which to save YAML log files

+   std::vector<llama_model_kv_override> kv_overrides;

    // TODO: avoid tuple, use struct
    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
    std::string lora_base = "";                               // base model path for the lora adapter

@@ -106,7 +108,6 @@ struct gpt_params {
    size_t hellaswag_tasks = 400;   // number of tasks to use when computing the HellaSwag score

    bool mul_mat_q      = true;  // if true, use mul_mat_q kernels instead of cuBLAS
-   bool memory_f16     = true;  // use f16 instead of f32 for memory kv
    bool random_prompt  = false; // do not randomize prompt if none provided
    bool use_color      = false; // use color to distinguish generations and inputs
    bool interactive    = false; // interactive mode

@@ -131,10 +132,14 @@ struct gpt_params {
    bool verbose_prompt = false; // print prompt tokens before generation
    bool infill         = false; // use infill mode
    bool dump_kv_cache  = false; // dump the KV cache contents for debugging purposes
+   bool no_kv_offload  = false; // disable KV offloading
+
+   std::string cache_type_k = "f16"; // KV cache data type for the K
+   std::string cache_type_v = "f16"; // KV cache data type for the V

    // multimodal models (see examples/llava)
    std::string mmproj = ""; // path to multimodal projector
    std::string image  = ""; // path to an image file
};

bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);

@@ -149,6 +154,12 @@ std::string gpt_random_prompt(std::mt19937 & rng);

void process_escapes(std::string& input);

+ //
+ // String parsing
+ //
+
+ std::string parse_samplers_input(std::string input);

//
// Model utils
//
common/grammar-parser.cpp

@@ -190,7 +190,7 @@ namespace grammar_parser {
            pos = parse_space(pos + 1, is_nested);
        } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
            if (last_sym_start == out_elements.size()) {
-               throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
+               throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
            }

            // apply transformation to previous symbol (last_sym_start to end) according to
common/sampling.cpp

@@ -99,6 +99,56 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
    return std::string(result);
}

+ std::string llama_sampling_order_print(const llama_sampling_params & params) {
+     std::string result = "CFG -> Penalties ";
+     if (params.mirostat == 0) {
+         for (auto s : params.samplers_sequence) {
+             switch (s) {
+                 case 'k': result += "-> top_k ";     break;
+                 case 'f': result += "-> tfs_z ";     break;
+                 case 'y': result += "-> typical_p "; break;
+                 case 'p': result += "-> top_p ";     break;
+                 case 'm': result += "-> min_p ";     break;
+                 case 't': result += "-> temp ";      break;
+                 default : break;
+             }
+         }
+     } else {
+         result += "-> mirostat ";
+     }
+
+     return result;
+ }
+
+ // no reason to expose this function in the header
+ static void sampler_queue(
+           struct llama_context * ctx_main,
+     const llama_sampling_params & params,
+           llama_token_data_array & cur_p,
+           size_t & min_keep) {
+     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
+
+     const float         temp              = params.temp;
+     const int32_t       top_k             = params.top_k <= 0 ? n_vocab : params.top_k;
+     const float         top_p             = params.top_p;
+     const float         min_p             = params.min_p;
+     const float         tfs_z             = params.tfs_z;
+     const float         typical_p         = params.typical_p;
+     const std::string & samplers_sequence = params.samplers_sequence;
+
+     for (auto s : samplers_sequence) {
+         switch (s) {
+             case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
+             case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
+             case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
+             case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
+             case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+             case 't': llama_sample_temp     (ctx_main, &cur_p, temp); break;
+             default : break;
+         }
+     }
+ }
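For illustration (hypothetical default parameters), the order print corresponds one-to-one with the stages sampler_queue will run:

    llama_sampling_params sparams; // samplers_sequence defaults to "kfypmt", mirostat to 0
    printf("%s\n", llama_sampling_order_print(sparams).c_str());
    // prints: CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temp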

llama_token llama_sampling_sample(
        struct llama_sampling_context * ctx_sampling,
        struct llama_context * ctx_main,
@@ -109,11 +159,6 @@ llama_token llama_sampling_sample(
    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));

-   const float   temp      = params.temp;
-   const int32_t top_k     = params.top_k <= 0 ? n_vocab : params.top_k;
-   const float   top_p     = params.top_p;
-   const float   min_p     = params.min_p;
-   const float   tfs_z     = params.tfs_z;
-   const float   typical_p = params.typical_p;
    const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
    const float   penalty_repeat = params.penalty_repeat;
    const float   penalty_freq   = params.penalty_freq;

@@ -188,12 +233,7 @@ llama_token llama_sampling_sample(
        // temperature sampling
        size_t min_keep = std::max(1, params.n_probs);

-       llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep);
-       llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep);
-       llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep);
-       llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep);
-       llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep);
-       llama_sample_temp     (ctx_main, &cur_p, temp);
+       sampler_queue(ctx_main, params, cur_p, min_keep);

        id = llama_sample_token(ctx_main, &cur_p);
common/sampling.h

@@ -10,22 +10,23 @@

// sampling parameters
typedef struct llama_sampling_params {
    int32_t     n_prev            = 64;       // number of previous tokens to remember
    int32_t     n_probs           = 0;        // if greater than 0, output the probabilities of top n_probs tokens.
    int32_t     top_k             = 40;       // <= 0 to use vocab size
    float       top_p             = 0.95f;    // 1.0 = disabled
    float       min_p             = 0.05f;    // 0.0 = disabled
    float       tfs_z             = 1.00f;    // 1.0 = disabled
    float       typical_p         = 1.00f;    // 1.0 = disabled
    float       temp              = 0.80f;    // 1.0 = disabled
    int32_t     penalty_last_n    = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
    float       penalty_repeat    = 1.10f;    // 1.0 = disabled
    float       penalty_freq      = 0.00f;    // 0.0 = disabled
    float       penalty_present   = 0.00f;    // 0.0 = disabled
    int32_t     mirostat          = 0;        // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
    float       mirostat_tau      = 5.00f;    // target entropy
    float       mirostat_eta      = 0.10f;    // learning rate
    bool        penalize_nl       = true;     // consider newlines as a repeatable token
+   std::string samplers_sequence = "kfypmt"; // top_k, tail_free, typical_p, top_p, min_p, temp

    std::string grammar; // optional BNF-like grammar to constrain sampling

@@ -80,6 +81,9 @@ std::string llama_sampling_prev_str(llama_sampling_context * ctx_sampling, llama
// Print sampling parameters into a string
std::string llama_sampling_print(const llama_sampling_params & params);

+ // Print sampling order into a string
+ std::string llama_sampling_order_print(const llama_sampling_params & params);

// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
examples/batched.swift/Sources/main.swift

@@ -215,9 +215,10 @@ print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end
llama_print_timings(context)

private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-   let n_tokens = text.count + (add_bos ? 1 : 0)
+   let utf8Count = text.utf8.count
+   let n_tokens = utf8Count + (add_bos ? 1 : 0)
    let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-   let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
+   let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false)
    var swiftTokens: [llama_token] = []
    for i in 0 ..< tokenCount {
        swiftTokens.append(tokens[Int(i)])
examples/llama-bench/llama-bench.cpp

@@ -54,6 +54,13 @@ static std::vector<T> split(const std::string & str, char delim) {
    return values;
}

+ template<typename T, typename F>
+ static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
+     std::vector<std::string> str_values;
+     std::transform(values.begin(), values.end(), std::back_inserter(str_values), f);
+     return str_values;
+ }

template<typename T>
static T avg(const std::vector<T> & v) {
    if (v.empty()) {
@@ -127,7 +134,8 @@ struct cmd_params {
    std::vector<int> n_prompt;
    std::vector<int> n_gen;
    std::vector<int> n_batch;
-   std::vector<bool> f32_kv;
+   std::vector<ggml_type> type_k;
+   std::vector<ggml_type> type_v;
    std::vector<int> n_threads;
    std::vector<int> n_gpu_layers;
    std::vector<int> main_gpu;

@@ -143,7 +151,8 @@ static const cmd_params cmd_params_defaults = {
    /* n_prompt     */ {512},
    /* n_gen        */ {128},
    /* n_batch      */ {512},
-   /* f32_kv       */ {false},
+   /* type_k       */ {GGML_TYPE_F16},
+   /* type_v       */ {GGML_TYPE_F16},
    /* n_threads    */ {get_num_physical_cores()},
    /* n_gpu_layers */ {99},
    /* main_gpu     */ {0},

@@ -163,7 +172,8 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -p, --n-prompt <n>            (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
    printf("  -n, --n-gen <n>               (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
    printf("  -b, --batch-size <n>          (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-   printf("  --memory-f32 <0|1>            (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+   printf("  -ctk <t>, --cache-type-k <t>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
+   printf("  -ctv <t>, --cache-type-v <t>  (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
    printf("  -t, --threads <n>             (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
    printf("  -ngl, --n-gpu-layers <n>      (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
    printf("  -mg, --main-gpu <i>           (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
@@ -174,9 +184,32 @@ static void print_usage(int /* argc */, char ** argv) {
    printf("  -v, --verbose                 (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
    printf("\n");
    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}

+ static ggml_type ggml_type_from_name(const std::string & s) {
+     if (s == "f16") {
+         return GGML_TYPE_F16;
+     }
+     if (s == "q8_0") {
+         return GGML_TYPE_Q8_0;
+     }
+     if (s == "q4_0") {
+         return GGML_TYPE_Q4_0;
+     }
+     if (s == "q4_1") {
+         return GGML_TYPE_Q4_1;
+     }
+     if (s == "q5_0") {
+         return GGML_TYPE_Q5_0;
+     }
+     if (s == "q5_1") {
+         return GGML_TYPE_Q5_1;
+     }
+
+     return GGML_TYPE_COUNT;
+ }
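Note the design difference from kv_cache_type_from_str in common.cpp, which throws: this variant returns the GGML_TYPE_COUNT sentinel, so the argument loop below can set invalid_param itself. A hypothetical probe:

    ggml_type gt = ggml_type_from_name("q4_0"); // GGML_TYPE_Q4_0
    if (ggml_type_from_name("foo") == GGML_TYPE_COUNT) {
        // unknown type name: the caller flags invalid_param instead of unwinding
    }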

static cmd_params parse_cmd_params(int argc, char ** argv) {
    cmd_params params;
    std::string arg;
@@ -225,13 +258,38 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = split<int>(argv[i], split_delim);
            params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
-       } else if (arg == "--memory-f32") {
+       } else if (arg == "-ctk" || arg == "--cache-type-k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
-           auto p = split<int>(argv[i], split_delim);
-           params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end());
+           auto p = split<std::string>(argv[i], split_delim);
+           std::vector<ggml_type> types;
+           for (const auto & t : p) {
+               ggml_type gt = ggml_type_from_name(t);
+               if (gt == GGML_TYPE_COUNT) {
+                   invalid_param = true;
+                   break;
+               }
+               types.push_back(gt);
+           }
+           params.type_k.insert(params.type_k.end(), types.begin(), types.end());
+       } else if (arg == "-ctv" || arg == "--cache-type-v") {
+           if (++i >= argc) {
+               invalid_param = true;
+               break;
+           }
+           auto p = split<std::string>(argv[i], split_delim);
+           std::vector<ggml_type> types;
+           for (const auto & t : p) {
+               ggml_type gt = ggml_type_from_name(t);
+               if (gt == GGML_TYPE_COUNT) {
+                   invalid_param = true;
+                   break;
+               }
+               types.push_back(gt);
+           }
+           params.type_v.insert(params.type_v.end(), types.begin(), types.end());
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
@@ -322,7 +380,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
    if (params.n_prompt.empty())     { params.n_prompt = cmd_params_defaults.n_prompt; }
    if (params.n_gen.empty())        { params.n_gen = cmd_params_defaults.n_gen; }
    if (params.n_batch.empty())      { params.n_batch = cmd_params_defaults.n_batch; }
-   if (params.f32_kv.empty())       { params.f32_kv = cmd_params_defaults.f32_kv; }
+   if (params.type_k.empty())       { params.type_k = cmd_params_defaults.type_k; }
+   if (params.type_v.empty())       { params.type_v = cmd_params_defaults.type_v; }
    if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
    if (params.main_gpu.empty())     { params.main_gpu = cmd_params_defaults.main_gpu; }
    if (params.mul_mat_q.empty())    { params.mul_mat_q = cmd_params_defaults.mul_mat_q; }
@@ -337,7 +396,8 @@ struct cmd_params_instance {
    int n_prompt;
    int n_gen;
    int n_batch;
-   bool f32_kv;
+   ggml_type type_k;
+   ggml_type type_v;
    int n_threads;
    int n_gpu_layers;
    int main_gpu;

@@ -366,7 +426,8 @@ struct cmd_params_instance {

        cparams.n_ctx = n_prompt + n_gen;
        cparams.n_batch = n_batch;
-       cparams.f16_kv = !f32_kv;
+       cparams.type_k = type_k;
+       cparams.type_v = type_v;
        cparams.mul_mat_q = mul_mat_q;

        return cparams;
@@ -381,7 +442,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
-   for (const auto & fk : params.f32_kv)
+   for (const auto & tk : params.type_k)
+   for (const auto & tv : params.type_v)
    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        cmd_params_instance instance = {

@@ -389,7 +451,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances_int(const cmd_p
            /* .n_prompt     = */ n_prompt,
            /* .n_gen        = */ n_gen,
            /* .n_batch      = */ nb,
-           /* .f32_kv       = */ fk,
+           /* .type_k       = */ tk,
+           /* .type_v       = */ tv,
            /* .n_threads    = */ nt,
            /* .n_gpu_layers = */ nl,
            /* .main_gpu     = */ mg,

@@ -411,7 +474,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
    for (const auto & mg : params.main_gpu)
    for (const auto & ts : params.tensor_split)
    for (const auto & nb : params.n_batch)
-   for (const auto & fk : params.f32_kv)
+   for (const auto & tk : params.type_k)
+   for (const auto & tv : params.type_v)
    for (const auto & mmq : params.mul_mat_q)
    for (const auto & nt : params.n_threads) {
        for (const auto & n_prompt : params.n_prompt) {

@@ -423,7 +487,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ n_prompt,
                /* .n_gen        = */ 0,
                /* .n_batch      = */ nb,
-               /* .f32_kv       = */ fk,
+               /* .type_k       = */ tk,
+               /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,

@@ -442,7 +507,8 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                /* .n_prompt     = */ 0,
                /* .n_gen        = */ n_gen,
                /* .n_batch      = */ nb,
-               /* .f32_kv       = */ fk,
+               /* .type_k       = */ tk,
+               /* .type_v       = */ tv,
                /* .n_threads    = */ nt,
                /* .n_gpu_layers = */ nl,
                /* .main_gpu     = */ mg,
@@ -490,7 +556,8 @@ struct test {
    uint64_t model_n_params;
    int n_batch;
    int n_threads;
-   bool f32_kv;
+   ggml_type type_k;
+   ggml_type type_v;
    int n_gpu_layers;
    int main_gpu;
    bool mul_mat_q;

@@ -509,7 +576,8 @@ struct test {
        model_n_params = llama_model_n_params(lmodel);
        n_batch = inst.n_batch;
        n_threads = inst.n_threads;
-       f32_kv = inst.f32_kv;
+       type_k = inst.type_k;
+       type_v = inst.type_v;
        n_gpu_layers = inst.n_gpu_layers;
        main_gpu = inst.main_gpu;
        mul_mat_q = inst.mul_mat_q;
@@ -572,7 +640,7 @@ struct test {
        "cuda", "opencl", "metal", "gpu_blas", "blas",
        "cpu_info", "gpu_info",
        "model_filename", "model_type", "model_size", "model_n_params",
-       "n_batch", "n_threads", "f16_kv",
+       "n_batch", "n_threads", "type_k", "type_v",
        "n_gpu_layers", "main_gpu", "mul_mat_q", "tensor_split",
        "n_prompt", "n_gen", "test_time",
        "avg_ns", "stddev_ns",

@@ -622,7 +690,7 @@ struct test {
        std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
        cpu_info, gpu_info,
        model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
-       std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
+       std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
        std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), tensor_split_str,
        std::to_string(n_prompt), std::to_string(n_gen), test_time,
        std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -806,8 +874,11 @@ struct markdown_printer : public printer {
        if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
            fields.push_back("n_batch");
        }
-       if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) {
-           fields.push_back("f16_kv");
+       if (params.type_k.size() > 1 || params.type_k != cmd_params_defaults.type_k) {
+           fields.push_back("type_k");
        }
+       if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
+           fields.push_back("type_v");
+       }
        if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
            fields.push_back("main_gpu");
examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

@@ -11,6 +11,8 @@ actor LlamaContext {
    private var context: OpaquePointer
    private var batch: llama_batch
    private var tokens_list: [llama_token]
+   /// This variable is used to store temporarily invalid cchars
+   private var temporary_invalid_cchars: [CChar]

    var n_len: Int32 = 512
    var n_cur: Int32 = 0

@@ -21,6 +23,7 @@ actor LlamaContext {
        self.context = context
        self.tokens_list = []
        self.batch = llama_batch_init(512, 0, 1)
+       self.temporary_invalid_cchars = []
    }

    deinit {
@@ -61,6 +64,7 @@ actor LlamaContext {
        print("attempting to complete \"\(text)\"")

        tokens_list = tokenize(text: text, add_bos: true)
+       temporary_invalid_cchars = []

        let n_ctx = llama_n_ctx(context)
        let n_kv_req = tokens_list.count + (Int(n_len) - tokens_list.count)

@@ -72,7 +76,7 @@ actor LlamaContext {
        }

        for id in tokens_list {
-           print(token_to_piece(token: id))
+           print(String(cString: token_to_piece(token: id) + [0]))
        }

        // batch = llama_batch_init(512, 0) // done in init()

@@ -115,10 +119,25 @@ actor LlamaContext {

        if new_token_id == llama_token_eos(context) || n_cur == n_len {
            print("\n")
-           return ""
+           let new_token_str = String(cString: temporary_invalid_cchars + [0])
+           temporary_invalid_cchars.removeAll()
+           return new_token_str
        }

-       let new_token_str = token_to_piece(token: new_token_id)
+       let new_token_cchars = token_to_piece(token: new_token_id)
+       temporary_invalid_cchars.append(contentsOf: new_token_cchars)
+       let new_token_str: String
+       if let string = String(validatingUTF8: temporary_invalid_cchars + [0]) {
+           temporary_invalid_cchars.removeAll()
+           new_token_str = string
+       } else if (0 ..< temporary_invalid_cchars.count).contains(where: {$0 != 0 && String(validatingUTF8: Array(temporary_invalid_cchars.suffix($0)) + [0]) != nil}) {
+           // in this case, at least the suffix of the temporary_invalid_cchars can be interpreted as UTF8 string
+           let string = String(cString: temporary_invalid_cchars + [0])
+           temporary_invalid_cchars.removeAll()
+           new_token_str = string
+       } else {
+           new_token_str = ""
+       }
        print(new_token_str)
        // tokens_list.append(new_token_id)
@@ -144,12 +163,14 @@ actor LlamaContext {

    func clear() {
        tokens_list.removeAll()
+       temporary_invalid_cchars.removeAll()
    }

    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
-       let n_tokens = text.count + (add_bos ? 1 : 0)
+       let utf8Count = text.utf8.count
+       let n_tokens = utf8Count + (add_bos ? 1 : 0)
        let tokens = UnsafeMutablePointer<llama_token>.allocate(capacity: n_tokens)
-       let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos, false)
+       let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, false)

        var swiftTokens: [llama_token] = []
        for i in 0..<tokenCount {
@@ -161,7 +182,8 @@ actor LlamaContext {
        return swiftTokens
    }

-   private func token_to_piece(token: llama_token) -> String {
+   /// - note: The result does not contain null-terminator
+   private func token_to_piece(token: llama_token) -> [CChar] {
        let result = UnsafeMutablePointer<Int8>.allocate(capacity: 8)
        result.initialize(repeating: Int8(0), count: 8)
        defer {

@@ -175,10 +197,12 @@ actor LlamaContext {
            defer {
                newResult.deallocate()
            }
-           _ = llama_token_to_piece(model, token, newResult, -nTokens)
-           return String(cString: newResult)
+           let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens)
+           let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
+           return Array(bufferPointer)
        } else {
-           return String(cString: result)
+           let bufferPointer = UnsafeBufferPointer(start: result, count: Int(nTokens))
+           return Array(bufferPointer)
        }
    }
}
examples/main/main.cpp

@@ -438,6 +438,7 @@ int main(int argc, char ** argv) {
        }
    }
    LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str());
+   LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str());
    LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    LOG_TEE("\n\n");
@@ -322,7 +322,6 @@ int main(int argc, char ** argv) {
    auto cparams = llama_context_default_params();
    cparams.n_ctx = 256;
    cparams.seed = 1;
-   cparams.f16_kv = false;

    ctx = llama_new_context_with_model(model, cparams);
examples/server/server.cpp

@@ -2109,10 +2109,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
        }
        params.yarn_beta_slow = std::stof(argv[i]);
    }
-   else if (arg == "--memory-f32" || arg == "--memory_f32")
-   {
-       params.memory_f16 = false;
-   }
    else if (arg == "--threads" || arg == "-t")
    {
        if (++i >= argc)
@@ -2388,6 +2384,7 @@ json oaicompat_completion_params_parse(

    // Map OpenAI parameters to llama.cpp parameters
    llama_params["prompt"]       = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+   llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
    llama_params["temperature"]  = json_value(body, "temperature", 0.8);
    llama_params["top_k"]        = json_value(body, "top_k", 40);
    llama_params["top_p"]        = json_value(body, "top_p", 0.95);
examples/simple/simple.cpp

@@ -75,7 +75,7 @@ int main(int argc, char ** argv) {
    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx) {
        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-       LOG_TEE("%s:        either reduce n_parallel or increase n_ctx\n", __func__);
+       LOG_TEE("%s:        either reduce n_len or increase n_ctx\n", __func__);
        return 1;
    }
examples/speculative/speculative.cpp

@@ -205,8 +205,9 @@ int main(int argc, char ** argv) {

        const std::string token_str = llama_token_to_piece(ctx_tgt, id);

-       printf("%s", token_str.c_str());
-       fflush(stdout);
+       if (!params.use_color) {
+           printf("%s", token_str.c_str());
+       }

        if (id == llama_token_eos(model_tgt)) {
            has_eos = true;

@@ -238,10 +239,18 @@ int main(int argc, char ** argv) {
                ++n_past_tgt;
                ++n_past_dft;
                ++i_dft;

+               if (params.use_color) {
+                   // Color token according to its origin sequence
+                   printf("\u001b[%dm%s\u001b[37m", (36 - s_keep % 6), token_str.c_str());
+                   fflush(stdout);
+               }
                continue;
            }
        }
+       if (params.use_color) {
+           printf("%s", token_str.c_str());
+       }
        fflush(stdout);

        LOG("the sampled target token (%d, '%s') did not match, or we ran out of drafted tokens\n", id, token_str.c_str());
@@ -1295,10 +1295,6 @@ int main(int argc, char ** argv) {
        opt_cb_data.last_save_iter = opt->iter;
    }

-   if (alloc) {
-       ggml_allocr_free(alloc);
-   }

    ggml_free(opt->ctx);
    free_train_state(train);
    ggml_free(model.ctx);
expose.h: 1 change
@@ -28,7 +28,6 @@ struct load_model_inputs
    const int blasthreads;
    const int max_context_length;
    const int batch_size;
-   const bool f16_kv;
    const bool low_vram;
    const bool use_mmq;
    const char * executable_path;
ggml-alloc.c: 49 changes
@@ -168,10 +168,6 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor *
    size = aligned_offset(NULL, size, alloc->alignment);
    AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);

-   if (!alloc->measure) {
-       ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
-   }

#ifdef GGML_ALLOCATOR_DEBUG
    remove_allocated_tensor(alloc, tensor);
#endif
@@ -237,7 +233,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) {
}

ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
-   struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+   struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);

    ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
@@ -449,7 +445,6 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
    ggml_tallocr_t alloc = node_tallocr(galloc, view);

    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
    if (update_backend) {
        view->backend = view->view_src->backend;

@@ -459,7 +454,7 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd

    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
-   assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend);
+   assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

    if (!alloc->measure) {
        ggml_backend_buffer_init_tensor(alloc->buffer, view);
@@ -765,3 +760,43 @@ size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
    return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
}

+ // utils
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+     GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+     size_t alignment = ggml_backend_buft_get_alignment(buft);
+
+     size_t nbytes = 0;
+     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+         if (t->data == NULL && t->view_src == NULL) {
+             nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+         }
+     }
+
+     if (nbytes == 0) {
+         fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+         return NULL;
+     }
+
+     ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+     ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+
+     for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+         if (t->data == NULL) {
+             if (t->view_src == NULL) {
+                 ggml_tallocr_alloc(tallocr, t);
+             } else {
+                 ggml_backend_view_init(buffer, t);
+             }
+         }
+     }
+
+     ggml_tallocr_free(tallocr);
+
+     return buffer;
+ }
+
+ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend) {
+     return ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_get_default_buffer_type(backend));
+ }
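A minimal caller sketch for the new utility (hypothetical tensor shapes; the backend variable is assumed to be an already initialized ggml_backend_t): create a no_alloc context, define tensors, then let the helper size and allocate one backend buffer for all of them.

    struct ggml_init_params ip = {
        /* .mem_size   = */ ggml_tensor_overhead() * 16, // metadata only, no tensor data
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true, // required: the helper asserts ggml_get_no_alloc(ctx)
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);

    // one buffer, sized with the backend's alignment and alloc-size rules, holds both tensors
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);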
ggml-alloc.h

@@ -8,6 +8,7 @@ extern "C" {

struct ggml_backend;
struct ggml_backend_buffer;
+ struct ggml_backend_buffer_type;

//
// Legacy API

@@ -80,6 +81,12 @@ GGML_API void ggml_gallocr_alloc_graph_n(
    struct ggml_hash_set hash_set,
    ggml_tallocr_t * hash_node_talloc);

+ // Utils
+ // Create a buffer and allocate all the tensors in a ggml_context
+ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
+ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);

#ifdef __cplusplus
}
#endif
ggml-backend-impl.h

@@ -12,31 +12,50 @@ extern "C" {
// Backend buffer
//

+ // buffer type
+ typedef void * ggml_backend_buffer_type_context_t;
+
+ struct ggml_backend_buffer_type_i {
+     ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+     size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+     size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+     bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ };
+
+ struct ggml_backend_buffer_type {
+     struct ggml_backend_buffer_type_i  iface;
+     ggml_backend_buffer_type_context_t context;
+ };
+
+ // buffer
typedef void * ggml_backend_buffer_context_t;

struct ggml_backend_buffer_i {
-   void   (*free_buffer)   (ggml_backend_buffer_t buffer);
-   void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
-   size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
-   void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
-   void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
+   void   (*free_buffer)(ggml_backend_buffer_t buffer);
+   //void (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+   void * (*get_base)   (ggml_backend_buffer_t buffer);
+   void   (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+   void   (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+   void   (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+   // (optional) copy tensor between different buffer types, allowing single-copy transfers
+   void   (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+   void   (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
};

struct ggml_backend_buffer {
-   struct ggml_backend_buffer_i iface;
-   ggml_backend_t backend;
+   struct ggml_backend_buffer_i  iface;
+   ggml_backend_buffer_type_t    buft;
    ggml_backend_buffer_context_t context;

    size_t size;
};

-GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
-       struct ggml_backend * backend,
+ggml_backend_buffer_t ggml_backend_buffer_init(
+       ggml_backend_buffer_type_t buft,
        struct ggml_backend_buffer_i iface,
        ggml_backend_buffer_context_t context,
        size_t size);

//
// Backend
//
@@ -49,20 +68,17 @@ extern "C" {
    void (*free)(ggml_backend_t backend);

    // buffer allocation
-   ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
+   ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);

-   // get buffer alignment
-   size_t (*get_alignment)(ggml_backend_t backend);
-
-   // tensor data access
-   // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+   // (optional) asynchronous tensor data access
    void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-   void (*synchronize)(ggml_backend_t backend);

-   // (optional) copy tensor between different backends, allowing single-copy transfers
-   void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-   void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+   // (optional) asynchronous tensor copy
+   void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+   void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+   void (*synchronize)(ggml_backend_t backend);

    // compute graph with a plan
    ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@@ -82,6 +98,15 @@ extern "C" {
    ggml_backend_context_t context;
};

+ //
+ // Backend registry
+ //
+
+ typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+
+ void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);

#ifdef __cplusplus
}
#endif
ggml-backend.c: 771 changes (diff too large to display)
ggml-backend.h

@@ -7,41 +7,44 @@
extern "C" {
#endif

+ typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+ typedef struct ggml_backend * ggml_backend_t;
+ typedef void * ggml_backend_graph_plan_t;
+
//
// Backend buffer
//

- struct ggml_backend_buffer;
- typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+ // buffer type
+ GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
+ GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+ GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+ GGML_API bool   ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);

- // backend buffer functions
+ // buffer
GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
- GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

//
// Backend
//

- struct ggml_backend;
- typedef struct ggml_backend * ggml_backend_t;
- typedef void * ggml_backend_graph_plan_t;

GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);

GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void         ggml_backend_free(ggml_backend_t backend);

+ GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);

GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);

- GGML_API void ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
@@ -57,6 +60,7 @@ extern "C" {

// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+ GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy

//
// CPU backend

@@ -68,8 +72,23 @@ extern "C" {
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

// Create a backend buffer from an existing pointer
- GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
+ GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
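The dropped ggml_backend_t parameter reflects that a CPU buffer now belongs to a buffer type rather than a backend instance. A hypothetical use of the new signature, wrapping caller-owned memory so ggml can place tensors in it:

    size_t size = 16 * 1024 * 1024;
    void * mem  = malloc(size);
    ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr(mem, size);
    // the buffer wraps, but does not own, mem; free(mem) only after buf is no longer used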
+ //
+ // Backend registry
+ //
+
+ // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+
+ GGML_API size_t                     ggml_backend_reg_get_count(void);
+ GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
+ GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+ GGML_API const char *               ggml_backend_reg_get_name(size_t i);
+ GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
+ GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
+ GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
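A sketch of generic backend selection through the new registry (a registered name such as "CPU" is assumed; real code should verify the lookup result before indexing):

    size_t idx = ggml_backend_reg_find_by_name("CPU");
    ggml_backend_t backend = ggml_backend_reg_init_backend(idx, /* params = */ NULL);

The name[:params] form accepted by ggml_backend_reg_init_backend_from_str covers both steps in one call, with an optional backend-specific parameter string after the colon.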
|
 //
 // Backend scheduler
 //

@@ -131,6 +150,32 @@ extern "C" {
             ggml_backend_sched_t sched,
             struct ggml_cgraph * graph);

+//
+// Utils
+//
+
+struct ggml_backend_graph_copy {
+    ggml_backend_buffer_t buffer;
+    struct ggml_context * ctx_allocated;
+    struct ggml_context * ctx_unallocated;
+    struct ggml_cgraph * graph;
+};
+
+// Copy a graph to a different backend
+GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+// Compare the output of two backends
+GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+// Tensor initialization
+GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

 #ifdef __cplusplus
 }
 #endif
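ggml_backend_compare_graph_backend drives both backends node by node and hands each pair of intermediate tensors to the callback; returning false stops the walk. A hedged sketch of such a callback, not part of the diff (a real test would read both tensors back and diff their contents):

    #include <stdbool.h>
    #include <stdio.h>
    #include "ggml.h"
    #include "ggml-backend.h"

    // Illustrative eval callback: print each compared node and keep going.
    static bool eval_cb(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) t2; (void) user_data;
        printf("node %d: %s (%s)\n", node_index, t1->name, ggml_op_desc(t1));
        return true; // returning false would abort the comparison early
    }

    // usage: ggml_backend_compare_graph_backend(cpu_backend, gpu_backend, graph, eval_cb, NULL);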
1517	ggml-cuda.cu (file diff suppressed because it is too large)

10	ggml-cuda.h
@@ -49,7 +49,15 @@ GGML_API int ggml_cuda_get_device_count(void);
 GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
+GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+
+GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API int  ggml_backend_cuda_get_device(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
+// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

 #ifdef __cplusplus
 }
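With the device index now explicit, multi-GPU setups pick a card at init time instead of getting an implicit default. A short sketch, not part of the diff, assuming a CUDA build with at least one device:

    #include <stdio.h>
    #include "ggml-cuda.h"

    int main(void) {
        if (ggml_cuda_get_device_count() < 1) {
            fprintf(stderr, "no CUDA devices\n");
            return 1;
        }

        ggml_backend_t cuda = ggml_backend_cuda_init(0); // device 0
        printf("is_cuda=%d device=%d\n", ggml_backend_is_cuda(cuda), ggml_backend_cuda_get_device(cuda));

        // pinned host memory speeds up CPU<->GPU copies when paired with the CPU backend
        ggml_backend_buffer_type_t host_bt = ggml_backend_cuda_host_buffer_type();
        (void) host_bt;

        ggml_backend_free(cuda);
        return 0;
    }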
@@ -232,7 +232,7 @@ bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml
 // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
 size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

-// returns GGML_HAHSHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
 size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);

 // return index, asserts if table is full
@@ -99,6 +99,12 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+
+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

 #ifdef __cplusplus
 }
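On Apple hardware the family check is meant to gate feature use before kernels are selected. A sketch, not part of the diff; the family constant is an assumption (7 is read here as the Apple7 class in Apple's feature-set tables):

    #include "ggml-metal.h"

    // Sketch: init Metal and check a GPU family before enabling a feature.
    void pick_kernels(void) {
        ggml_backend_t metal = ggml_backend_metal_init();
        if (metal && ggml_backend_metal_supports_family(metal, 7)) {
            // e.g. safe to rely on simdgroup matrix kernels
        }
        ggml_backend_free(metal);
    }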
708	ggml-metal.m (file diff suppressed because it is too large)
947	ggml-metal.metal (file diff suppressed because it is too large)

414	ggml.c
@@ -233,24 +233,6 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)

-//
-// tensor access macros
-//
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -1613,6 +1595,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GROUP_NORM",

     "MUL_MAT",
+    "MUL_MAT_ID",
     "OUT_PROD",

     "SCALE",

@@ -1640,6 +1623,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "POOL_1D",
     "POOL_2D",
     "UPSCALE",
+    "ARGSORT",

     "FLASH_ATTN",
     "FLASH_FF",

@@ -1666,7 +1650,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };

-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -1695,6 +1679,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "group_norm(x)",

     "X*Y",
+    "X[i]*Y",
     "X*Y",

     "x*v",

@@ -1722,6 +1707,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "pool_1d(x)",
     "pool_2d(x)",
     "upscale(x)",
+    "argsort(x)",

     "flash_attn(x)",
     "flash_ff(x)",

@@ -1748,10 +1734,28 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };

-static_assert(GGML_OP_COUNT == 68, "GGML_OP_COUNT != 68");
+static_assert(GGML_OP_COUNT == 70, "GGML_OP_COUNT != 70");

 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");

+static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
+    "ABS",
+    "SGN",
+    "NEG",
+    "STEP",
+    "TANH",
+    "ELU",
+    "RELU",
+    "GELU",
+    "GELU_QUICK",
+    "SILU",
+    "LEAKY",
+};
+
+static_assert(GGML_UNARY_OP_COUNT == 11, "GGML_UNARY_OP_COUNT != 11");
+
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -1771,6 +1775,7 @@ static void ggml_setup_op_has_task_pass(void) {

         p[GGML_OP_ACC           ] = true;
         p[GGML_OP_MUL_MAT       ] = true;
+        p[GGML_OP_MUL_MAT_ID    ] = true;
         p[GGML_OP_OUT_PROD      ] = true;
         p[GGML_OP_SET           ] = true;
         p[GGML_OP_GET_ROWS_BACK ] = true;
@@ -2023,6 +2028,20 @@ const char * ggml_op_symbol(enum ggml_op op) {
     return GGML_OP_SYMBOL[op];
 }

+const char * ggml_unary_op_name(enum ggml_unary_op op) {
+    return GGML_UNARY_OP_NAME[op];
+}
+
+const char * ggml_op_desc(const struct ggml_tensor * t) {
+    if (t->op == GGML_OP_UNARY) {
+        enum ggml_unary_op uop = ggml_get_unary_op(t);
+        return ggml_unary_op_name(uop);
+    }
+    else {
+        return ggml_op_name(t->op);
+    }
+}
+
 size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
@@ -3154,9 +3173,7 @@ static struct ggml_tensor * ggml_add_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    // TODO: support less-strict constraint
-    //       GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));

     bool is_node = false;

@@ -3371,9 +3388,7 @@ static struct ggml_tensor * ggml_mul_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    // TODO: support less-strict constraint
-    //       GGML_ASSERT(ggml_can_repeat(b, a));
-    GGML_ASSERT(ggml_can_repeat_rows(b, a));
+    GGML_ASSERT(ggml_can_repeat(b, a));

     bool is_node = false;

@@ -3418,7 +3433,7 @@ static struct ggml_tensor * ggml_div_impl(
         struct ggml_tensor * a,
         struct ggml_tensor * b,
         bool inplace) {
-    GGML_ASSERT(ggml_are_same_shape(a, b));
+    GGML_ASSERT(ggml_can_repeat(b, a));

     bool is_node = false;
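The relaxed asserts are the user-visible change here: ggml_add, ggml_mul, and ggml_div now broadcast b across a whenever each dimension of a is a multiple of the corresponding dimension of b (ggml_can_repeat), where previously only whole-row repetition was allowed. A sketch of a shape only the new check accepts, not part of the diff (ctx is assumed to be an initialized ggml_context):

    // one scalar per row, broadcast across the 64 elements of each row
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 128);
    struct ggml_tensor * s = ggml_new_tensor_2d(ctx, GGML_TYPE_F32,  1, 128);
    struct ggml_tensor * y = ggml_mul(ctx, x, s); // row i of x scaled by s[i]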
@@ -4056,6 +4071,49 @@ struct ggml_tensor * ggml_mul_mat(
     return result;
 }

+// ggml_mul_mat_id
+
+struct ggml_tensor * ggml_mul_mat_id(
+        struct ggml_context * ctx,
+        struct ggml_tensor * as[],
+        struct ggml_tensor * ids,
+        int id,
+        struct ggml_tensor * b) {
+
+    int64_t n_as = ids->ne[0];
+
+    GGML_ASSERT(ids->type == GGML_TYPE_I32);
+    GGML_ASSERT(ggml_is_vector(ids));
+    GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2);
+    GGML_ASSERT(id >= 0 && id < n_as);
+
+    bool is_node = false;
+
+    if (as[0]->grad || b->grad) {
+        is_node = true;
+    }
+
+    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
+
+    ggml_set_op_params_i32(result, 0, id);
+
+    result->op   = GGML_OP_MUL_MAT_ID;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = ids;
+    result->src[1] = b;
+
+    for (int64_t i = 0; i < n_as; i++) {
+        struct ggml_tensor * a = as[i];
+        GGML_ASSERT(ggml_are_same_shape(as[0], a));
+        GGML_ASSERT(ggml_can_mul_mat(a, b));
+        GGML_ASSERT(!ggml_is_transposed(a));
+        result->src[i + 2] = a;
+    }
+
+    return result;
+}
+
 // ggml_out_prod

 struct ggml_tensor * ggml_out_prod(
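ggml_mul_mat_id is the building block for mixture-of-experts style routing: the candidate weight matrices go in as[], ids holds one expert index per slot, and id selects which slot is used. A hedged usage sketch, not part of the diff (ctx, b, and the w0..w3 expert tensors are assumed to exist and share a shape):

    // Routes through one of n_as expert matrices; per the header comment,
    // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b).
    struct ggml_tensor * as[4] = { w0, w1, w2, w3 };
    struct ggml_tensor * ids   = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4);
    // ... ids->data is filled with expert indices before graph evaluation ...
    struct ggml_tensor * out   = ggml_mul_mat_id(ctx, as, ids, /*id=*/0, b);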
@@ -4209,7 +4267,7 @@ struct ggml_tensor * ggml_set_2d_inplace(
         struct ggml_tensor * b,
         size_t nb1,
         size_t offset) {
-    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
+    return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
 }

 // ggml_cpy
@@ -5468,6 +5526,43 @@ struct ggml_tensor * ggml_upscale(
     return ggml_upscale_impl(ctx, a, scale_factor);
 }

+// ggml_argsort
+
+struct ggml_tensor * ggml_argsort(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        enum ggml_sort_order order) {
+    bool is_node = false;
+
+    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, a->n_dims, a->ne);
+
+    ggml_set_op_params_i32(result, 0, (int32_t) order);
+
+    result->op   = GGML_OP_ARGSORT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+
+    return result;
+}
+
+// ggml_top_k
+
+struct ggml_tensor * ggml_top_k(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        int k) {
+    GGML_ASSERT(a->ne[0] >= k);
+
+    struct ggml_tensor * result = ggml_argsort(ctx, a, GGML_SORT_DESC);
+
+    result = ggml_view_4d(ctx, result,
+                k, result->ne[1], result->ne[2], result->ne[3],
+                   result->nb[1], result->nb[2], result->nb[3],
+                0);
+
+    return result;
+}
+
 // ggml_flash_attn

 struct ggml_tensor * ggml_flash_attn(
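Since ggml_top_k is just a descending argsort followed by a view of the first k columns, the result holds I32 indices into each row, not the values themselves. A hedged sketch, not part of the diff (ctx and logits assumed; values would be gathered afterwards, e.g. on the host):

    // indices of the 40 largest entries per row of `logits`
    struct ggml_tensor * topk = ggml_top_k(ctx, logits, 40); // I32, ne[0] == 40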
@@ -6827,7 +6922,7 @@ static void ggml_compute_forward_add_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;

@@ -6860,16 +6955,19 @@ static void ggml_compute_forward_add_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;

             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
-            vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
+                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
+                ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
+            }
         }
     } else {
         // src1 is not contiguous

@@ -6886,8 +6984,9 @@ static void ggml_compute_forward_add_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);

-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);

                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
@@ -7607,7 +7706,7 @@ static void ggml_compute_forward_mul_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    GGML_ASSERT(ggml_can_repeat_rows(src1, src0) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;

@@ -7630,7 +7729,6 @@ static void ggml_compute_forward_mul_f32(

     GGML_ASSERT( nb0 == sizeof(float));
     GGML_ASSERT(nb00 == sizeof(float));
-    GGML_ASSERT(ne00 == ne10);

     if (nb10 == sizeof(float)) {
         for (int64_t ir = ith; ir < nr; ir += nth) {

@@ -7642,20 +7740,21 @@ static void ggml_compute_forward_mul_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;

             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);

+            for (int64_t r = 0 ; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
                 UNUSED(ggml_vec_mul_f32);

-                vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
+                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-                ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
+                ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-                // }
-            // }
+            }
         }
     } else {
         // src1 is not contiguous

@@ -7673,8 +7772,9 @@ static void ggml_compute_forward_mul_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);

-            for (int64_t i0 = 0; i0 < ne00; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);

                 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
             }
@@ -7708,14 +7808,16 @@ static void ggml_compute_forward_div_f32(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
+    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));

     if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
         return;
     }

-    const int nr = ggml_nrows(src0);
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);

     GGML_TENSOR_BINARY_OP_LOCALS

@@ -7723,41 +7825,50 @@ static void ggml_compute_forward_div_f32(
     GGML_ASSERT(nb00 == sizeof(float));

     if (nb10 == sizeof(float)) {
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
+
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef GGML_USE_ACCELERATE
                 UNUSED(ggml_vec_div_f32);

-            vDSP_vdiv(
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
+                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            ggml_vec_div_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+                ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-                // }
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);

-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);

                 dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
             }
@@ -8203,7 +8314,7 @@ static void ggml_compute_forward_repeat_f16(
         return;
     }

-    GGML_TENSOR_UNARY_OP_LOCALS;
+    GGML_TENSOR_UNARY_OP_LOCALS

     // guaranteed to be an integer due to the check in ggml_can_repeat
     const int nr0 = (int)(ne0/ne00);

@@ -8348,6 +8459,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(src0->nb[0] == sizeof(float));

     const int ith = params->ith;
+    const int nth = params->nth;

     GGML_TENSOR_BINARY_OP_LOCALS

@@ -8357,7 +8469,7 @@ static void ggml_compute_forward_concat_f32(
     GGML_ASSERT(nb10 == sizeof(float));

     for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
             if (i2 < ne02) { // src0
                 for (int i1 = 0; i1 < ne1; i1++) {
                     for (int i0 = 0; i0 < ne0; i0++) {
@@ -9517,6 +9629,8 @@ static void ggml_compute_forward_mul_mat(
             char * wdata = params->wdata;
             const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type);

+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
                     for (int64_t i11 = 0; i11 < ne11; ++i11) {

@@ -9618,6 +9732,26 @@ static void ggml_compute_forward_mul_mat(
     }
 }

+// ggml_compute_forward_mul_mat_id
+
+static void ggml_compute_forward_mul_mat_id(
+        const struct ggml_compute_params * params,
+        struct ggml_tensor * dst) {
+
+    const struct ggml_tensor * ids  = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+
+    const int id = ggml_get_op_params_i32(dst, 0);
+
+    const int a_id = ((int32_t *)ids->data)[id];
+
+    GGML_ASSERT(a_id >= 0 && a_id < ids->ne[0]);
+
+    const struct ggml_tensor * src0 = dst->src[a_id + 2];
+
+    ggml_compute_forward_mul_mat(params, src0, src1, dst);
+}
+
 // ggml_compute_forward_out_prod

 static void ggml_compute_forward_out_prod_f32(
@@ -12021,6 +12155,67 @@ static void ggml_compute_forward_upscale(
     }
 }

+// ggml_compute_forward_argsort
+
+static void ggml_compute_forward_argsort_f32(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return;
+    }
+
+    GGML_TENSOR_UNARY_OP_LOCALS
+
+    GGML_ASSERT(nb0 == sizeof(float));
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int64_t nr = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) ggml_get_op_params_i32(dst, 0);
+
+    for (int64_t i = ith; i < nr; i += nth) {
+        int32_t * dst_data = (int32_t *)((char *) dst->data + i*nb1);
+        const float * src_data = (float *)((char *) src0->data + i*nb01);
+
+        for (int64_t j = 0; j < ne0; j++) {
+            dst_data[j] = j;
+        }
+
+        // C doesn't have a functional sort, so we do a bubble sort instead
+        for (int64_t j = 0; j < ne0; j++) {
+            for (int64_t k = j + 1; k < ne0; k++) {
+                if ((order == GGML_SORT_ASC  && src_data[dst_data[j]] > src_data[dst_data[k]]) ||
+                    (order == GGML_SORT_DESC && src_data[dst_data[j]] < src_data[dst_data[k]])) {
+                    int32_t tmp = dst_data[j];
+                    dst_data[j] = dst_data[k];
+                    dst_data[k] = tmp;
+                }
+            }
+        }
+    }
+}
+
+static void ggml_compute_forward_argsort(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+
+    switch (src0->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_argsort_f32(params, src0, dst);
+            } break;
+        default:
+            {
+                GGML_ASSERT(false);
+            } break;
+    }
+}
+
 // ggml_compute_forward_flash_attn

 static void ggml_compute_forward_flash_attn_f32(
@@ -13844,6 +14039,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
             } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                ggml_compute_forward_mul_mat_id(params, tensor);
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 ggml_compute_forward_out_prod(params, tensor->src[0], tensor->src[1], tensor);

@@ -13948,6 +14147,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_upscale(params, tensor->src[0], tensor);
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                ggml_compute_forward_argsort(params, tensor->src[0], tensor);
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 const int32_t t = ggml_get_op_params_i32(tensor, 0);

@@ -14598,6 +14801,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                         zero_table);
             }
         } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 GGML_ASSERT(false); // TODO: not implemented

@@ -14936,6 +15143,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 struct ggml_tensor * flash_grad = NULL;
@@ -15296,12 +15507,8 @@ struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
     return ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, false);
 }

-struct ggml_cgraph * ggml_graph_view(struct ggml_context * ctx, struct ggml_cgraph * cgraph0, int i0, int i1) {
-    const size_t obj_size = sizeof(struct ggml_cgraph);
-    struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, obj_size);
-    struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
-
-    *cgraph = (struct ggml_cgraph) {
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) {
+    struct ggml_cgraph cgraph = {
         /*.size    =*/ 0,
         /*.n_nodes =*/ i1 - i0,
         /*.n_leafs =*/ 0,
@@ -15536,7 +15743,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 n_tasks = n_threads;
             } break;
         case GGML_OP_SUB:
-        case GGML_OP_DIV:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
         case GGML_OP_LOG:

@@ -15569,10 +15775,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                     {
                         n_tasks = n_threads;
                     } break;
+                default:
+                    GGML_ASSERT(false);
             }
             break;
         case GGML_OP_SILU_BACK:
         case GGML_OP_MUL:
+        case GGML_OP_DIV:
         case GGML_OP_NORM:
         case GGML_OP_RMS_NORM:
         case GGML_OP_RMS_NORM_BACK:

@@ -15610,6 +15819,11 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             }
 #endif
         } break;
+        case GGML_OP_MUL_MAT_ID:
+            {
+                // FIXME: blas
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_OUT_PROD:
             {
                 n_tasks = n_threads;

@@ -15669,6 +15883,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case GGML_OP_ARGSORT:
+            {
+                n_tasks = n_threads;
+            } break;
         case GGML_OP_FLASH_ATTN:
             {
                 n_tasks = n_threads;

@@ -15731,6 +15949,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1;
             } break;
         case GGML_OP_COUNT:
             {
                 GGML_ASSERT(false);
             } break;
+        default:
+            {
+                fprintf(stderr, "%s: op not implemented: ", __func__);
@@ -15923,6 +16145,23 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                         cur = ggml_type_size(vec_dot_type)*ggml_nelements(node->src[1])/ggml_blck_size(vec_dot_type);
                     }
                 } break;
+            case GGML_OP_MUL_MAT_ID:
+                {
+                    const struct ggml_tensor * a = node->src[2];
+                    const struct ggml_tensor * b = node->src[1];
+                    const enum ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
+                        if (a->type != GGML_TYPE_F32) {
+                            // here we need memory just for single 2D matrix from src0
+                            cur = ggml_type_size(GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
+                        }
+                    } else
+#endif
+                    if (b->type != vec_dot_type) {
+                        cur = ggml_type_size(vec_dot_type)*ggml_nelements(b)/ggml_blck_size(vec_dot_type);
+                    }
+                } break;
            case GGML_OP_OUT_PROD:
                {
                    if (ggml_is_quantized(node->src[0]->type)) {

@@ -15958,9 +16197,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
                     GGML_ASSERT(false);
                 }
             } break;
-            case GGML_OP_IM2COL:
-                {
-                } break;
             case GGML_OP_CONV_TRANSPOSE_2D:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -17799,8 +18035,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
         memcpy(&qh, &y[i].qh, sizeof(qh));

         for (int j = 0; j < QK5_0; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 )))   >> (j + 0 ))   << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16)))   >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));

             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;

@@ -17829,8 +18065,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
         memcpy(&qh, &y[i].qh, sizeof(qh));

         for (int j = 0; j < QK5_1; j += 2) {
-            const uint8_t vh0 = ((qh & (1u << (j + 0 )))   >> (j + 0 ))   << 4;
-            const uint8_t vh1 = ((qh & (1u << (j + 16)))   >> (j + 12));
+            const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+            const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));

             // cast to 16 bins
             const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
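The fix matters because j advances by 2 while each byte of qs packs two 4-bit quants, so the pair decoded at step j is elements j/2 and j/2 + 16, whose high bits live at qh bit positions j/2 and j/2 + 16. A worked check for j = 6, illustrative arithmetic only and not part of the diff:

    // For j = 6 the pair is element 3 (low nibble of qs[3]) and element 19
    // (high nibble of qs[3]); their high bits are qh bits 3 and 19.
    uint32_t qh = 0; /* packed high bits */
    int j = 6;
    uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4; // bit 3  -> bit 4 of the value
    uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));      // bit 19 -> bit 4 of the value
    // the old code tested bits j = 6 and j + 16 = 22, which belong to
    // elements 6 and 22, not to the pair decoded from qs[j/2]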
@@ -18020,6 +18256,7 @@ struct gguf_kv {

 struct gguf_header {
     char magic[4];
+
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv;      // GGUFv2

@@ -18123,7 +18360,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

     for (uint32_t i = 0; i < sizeof(magic); i++) {
         if (magic[i] != GGUF_MAGIC[i]) {
-            fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+            fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
             fclose(file);
             return NULL;
         }

@@ -18138,7 +18375,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         strncpy(ctx->header.magic, magic, 4);
-
         ctx->kv    = NULL;
         ctx->infos = NULL;
         ctx->data  = NULL;
53	ggml.h
@@ -290,6 +290,20 @@
     const type prefix##3 = (pointer)->array[3]; \
     GGML_UNUSED(prefix##3);

+#define GGML_TENSOR_UNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+#define GGML_TENSOR_BINARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
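These macros (moved here from ggml.c) expand GGML_TENSOR_LOCALS once per operand, so a kernel body gets the shapes and strides of src0, src1, and dst as plain locals. Roughly, the binary variant stands for locals like the following (a paraphrase of the expansion, not a verbatim preprocessor dump):

    const int64_t ne00 = src0->ne[0]; // ... ne01, ne02, ne03
    const size_t  nb00 = src0->nb[0]; // ... nb01, nb02, nb03
    const int64_t ne10 = src1->ne[0]; // ... ne11, ne12, ne13
    const size_t  nb10 = src1->nb[0]; // ... nb11, nb12, nb13
    const int64_t ne0  = dst->ne[0];  // ... ne1, ne2, ne3
    const size_t  nb0  = dst->nb[0];  // ... nb1, nb2, nb3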
@@ -388,6 +402,7 @@ extern "C" {
         GGML_OP_GROUP_NORM,

         GGML_OP_MUL_MAT,
+        GGML_OP_MUL_MAT_ID,
         GGML_OP_OUT_PROD,

         GGML_OP_SCALE,

@@ -414,8 +429,8 @@ extern "C" {
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
         GGML_OP_POOL_2D,
-
         GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_ARGSORT,

         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,

@@ -455,7 +470,9 @@ extern "C" {
         GGML_UNARY_OP_GELU,
         GGML_UNARY_OP_GELU_QUICK,
         GGML_UNARY_OP_SILU,
-        GGML_UNARY_OP_LEAKY
+        GGML_UNARY_OP_LEAKY,
+
+        GGML_UNARY_OP_COUNT,
     };

     enum ggml_object_type {

@@ -638,6 +655,9 @@ extern "C" {
     GGML_API const char * ggml_op_name  (enum ggml_op op);
     GGML_API const char * ggml_op_symbol(enum ggml_op op);

+    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+
     GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);

     GGML_API bool ggml_is_quantized(enum ggml_type type);
@@ -1034,6 +1054,15 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

+    // indirect matrix multiplication
+    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
+    GGML_API struct ggml_tensor * ggml_mul_mat_id(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * as[],
+            struct ggml_tensor  * ids,
+            int                   id,
+            struct ggml_tensor  * b);
+
     // A: m columns, n rows,
     // B: p columns, n rows,
     // result is m columns, p rows
@@ -1527,6 +1556,23 @@ extern "C" {
             struct ggml_tensor  * a,
             int                   scale_factor);

+    // sort rows
+    enum ggml_sort_order {
+        GGML_SORT_ASC,
+        GGML_SORT_DESC,
+    };
+
+    GGML_API struct ggml_tensor * ggml_argsort(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_sort_order  order);
+
+    // top k elements per row
+    GGML_API struct ggml_tensor * ggml_top_k(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   k);
+
     GGML_API struct ggml_tensor * ggml_flash_attn(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
@@ -1588,7 +1634,6 @@ extern "C" {
             int                   kh);

     // used in sam
-
     GGML_API struct ggml_tensor * ggml_add_rel_pos(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,

@@ -1763,7 +1808,7 @@ extern "C" {
     GGML_API struct ggml_cgraph * ggml_new_graph        (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
     GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
     GGML_API struct ggml_cgraph * ggml_graph_dup        (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph * ggml_graph_view       (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
+    GGML_API struct ggml_cgraph   ggml_graph_view       (struct ggml_cgraph * cgraph, int i0, int i1);
     GGML_API void                 ggml_graph_cpy        (struct ggml_cgraph * src, struct ggml_cgraph * dst);
     GGML_API void                 ggml_graph_reset      (struct ggml_cgraph * cgraph); // zero grads
     GGML_API void                 ggml_graph_clear      (struct ggml_cgraph * cgraph);
@@ -679,7 +679,6 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         blasbatchsize = 8;
     }
-    params.memory_f16 = inputs.f16_kv;

     auto clamped_max_context_length = inputs.max_context_length;

@@ -768,7 +767,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_ctx_params_v2.n_ctx = clamped_max_context_length;
         //llama_ctx_params.n_parts = -1;
         llama_ctx_params_v2.seed = -1;
-        llama_ctx_params_v2.f16_kv = inputs.f16_kv;
+        llama_ctx_params_v2.f16_kv = true;
         llama_ctx_params_v2.logits_all = false;
         llama_ctx_params_v2.use_mmap = inputs.use_mmap;
         llama_ctx_params_v2.use_mlock = inputs.use_mlock;

@@ -818,7 +817,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llama_ctx_params.n_ctx = clamped_max_context_length;
         //llama_ctx_paran_parts = -1;
         llama_ctx_params.seed = -1;
-        llama_ctx_params.f16_kv = inputs.f16_kv;
+        llama_ctx_params.f16_kv = true;
         llama_ctx_params.low_vram = inputs.low_vram;
         llama_ctx_params.mul_mat_q = inputs.use_mmq;
         llama_ctx_params.logits_all = false;

@@ -895,7 +894,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in

         //llama_ctx_paran_parts = -1;
         llama_ctx_params.seed = -1;
-        llama_ctx_params.f16_kv = inputs.f16_kv;
+        //llama_ctx_params.f16_kv = true;
         //llama_ctx_params.low_vram = inputs.low_vram;
         llama_ctx_params.mul_mat_q = inputs.use_mmq;
         llama_ctx_params.logits_all = false;
@@ -24,7 +24,6 @@ class load_model_inputs(ctypes.Structure):
                 ("blasthreads", ctypes.c_int),
                 ("max_context_length", ctypes.c_int),
                 ("batch_size", ctypes.c_int),
-                ("f16_kv", ctypes.c_bool),
                 ("low_vram", ctypes.c_bool),
                 ("use_mmq", ctypes.c_bool),
                 ("executable_path", ctypes.c_char_p),

@@ -235,7 +234,6 @@ def load_model(model_filename):
     inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
     inputs.use_mmq = (True if (args.usecublas and "mmq" in args.usecublas) else False)
     inputs.blasthreads = args.blasthreads
-    inputs.f16_kv = True
     inputs.use_mmap = (not args.nommap)
     inputs.use_mlock = args.usemlock
     inputs.lora_filename = "".encode("UTF-8")
33	llama.h
@@ -42,7 +42,7 @@
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.

@@ -158,6 +158,22 @@ extern "C" {
         llama_seq_id all_seq_id; // used if seq_id == NULL
     } llama_batch;

+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+
+    struct llama_model_kv_override {
+        char key[128];
+        enum llama_model_kv_override_type tag;
+        union {
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
+
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         int32_t main_gpu;     // the GPU that is used for scratch and small tensors
@@ -165,9 +181,13 @@ extern "C" {

         // called with a progress value between 0 and 1, pass NULL to disable
         llama_progress_callback progress_callback;
+
         // context pointer passed to the progress callback
         void * progress_callback_user_data;

+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
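A hedged sketch of supplying overrides through the new field when loading a model, not part of the diff; the key name is illustrative, and the empty-key terminator is how llama.cpp appears to detect the end of the array:

    #include <string.h>
    #include "llama.h"

    // Sketch: force an integer metadata key at load time.
    void load_with_override(const char * path) {
        struct llama_model_kv_override overrides[2];
        memset(overrides, 0, sizeof(overrides));

        strncpy(overrides[0].key, "some.arch.key", sizeof(overrides[0].key) - 1);
        overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
        overrides[0].int_value = 8;
        // overrides[1] stays zeroed: an empty key terminates the list

        struct llama_model_params mparams = llama_model_default_params();
        mparams.kv_overrides = overrides;

        struct llama_model * model = llama_load_model_from_file(path, mparams);
        (void) model;
    }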
@@ -191,11 +211,14 @@ extern "C" {
         float    yarn_beta_slow;  // YaRN high correction dim
         uint32_t yarn_orig_ctx;   // YaRN original context size

+        enum ggml_type type_k; // data type for K cache
+        enum ggml_type type_v; // data type for V cache
+
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels
-        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool embedding;  // embedding mode only
+        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one
+        bool embedding;   // embedding mode only
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
     };

     // model quantization parameters
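The f16_kv boolean is gone: the K and V caches are now typed independently, which is what the new -ctk/-ctv flags feed. A hedged sketch of requesting a quantized K cache, not part of the diff; which type combinations actually work depends on the build and backend:

    #include "llama.h"

    // Sketch: q8_0 K cache, f16 V cache, KV ops kept on the GPU.
    struct llama_context_params make_cparams(void) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.type_k      = GGML_TYPE_Q8_0;
        cparams.type_v      = GGML_TYPE_F16;
        cparams.offload_kqv = true;
        return cparams;
    }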
1357	tests/test-backend-ops.cpp (new file; diff suppressed because it is too large)