Merge branch 'ggerganov:master' into avx_optimizations

commit d635c75b85
70 changed files with 4901 additions and 3299 deletions

Makefile (7 changes)
@@ -925,6 +925,7 @@ OBJ_LLAMA = \
 
 OBJ_COMMON = \
     common/common.o \
+    common/arg.o \
     common/console.o \
     common/ngram-cache.o \
     common/sampling.o \

@@ -1157,6 +1158,11 @@ common/common.o: \
     include/llama.h
     $(CXX) $(CXXFLAGS) -c $< -o $@
 
+common/arg.o: \
+    common/arg.cpp \
+    common/arg.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
 common/sampling.o: \
     common/sampling.cpp \
     common/sampling.h \

@@ -1448,7 +1454,6 @@ llama-gen-docs: examples/gen-docs/gen-docs.cpp \
     $(OBJ_ALL)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-    ./llama-gen-docs
 
 libllava.a: examples/llava/llava.cpp \
     examples/llava/llava.h \
@@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ## Hot topics
 
-- *add hot topics here*
+- Huggingface GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
 
 ----
 

@@ -163,6 +163,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
+- [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 
@@ -54,6 +54,8 @@ add_library(${TARGET} STATIC
     base64.hpp
     common.h
     common.cpp
+    arg.h
+    arg.cpp
     sampling.h
     sampling.cpp
     console.h
common/arg.cpp (new file, 1987 lines)
File diff suppressed because it is too large.
common/arg.h (new file, 77 lines)

@@ -0,0 +1,77 @@
+#pragma once
+
+#include "common.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+//
+// CLI argument parsing
+//
+
+struct llama_arg {
+    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    std::vector<const char *> args;
+    const char * value_hint = nullptr; // help text or example for arg value
+    const char * value_hint_2 = nullptr; // for second arg value
+    const char * env = nullptr;
+    std::string help;
+    bool is_sparam = false; // is current arg a sampling param?
+    void (*handler_void)   (gpt_params & params) = nullptr;
+    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
+    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
+    void (*handler_int)    (gpt_params & params, int) = nullptr;
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &)
+    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const std::string & help,
+        void (*handler)(gpt_params & params, int)
+    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
+
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const std::string & help,
+        void (*handler)(gpt_params & params)
+    ) : args(args), help(help), handler_void(handler) {}
+
+    // support 2 values for arg
+    llama_arg(
+        const std::initializer_list<const char *> & args,
+        const char * value_hint,
+        const char * value_hint_2,
+        const std::string & help,
+        void (*handler)(gpt_params & params, const std::string &, const std::string &)
+    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
+
+    llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
+    llama_arg & set_env(const char * env);
+    llama_arg & set_sparam();
+    bool in_example(enum llama_example ex);
+    bool get_value_from_env(std::string & output);
+    bool has_value_from_env();
+    std::string to_string();
+};
+
+struct gpt_params_context {
+    enum llama_example ex = LLAMA_EXAMPLE_COMMON;
+    gpt_params & params;
+    std::vector<llama_arg> options;
+    void(*print_usage)(int, char **) = nullptr;
+    gpt_params_context(gpt_params & params) : params(params) {}
+};
+
+// parse input arguments from CLI
+// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+
+// function to be used by test-arg-parser
+gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
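For orientation, here is a minimal sketch (not part of this commit) of how a caller adopts the parsing entry point declared above. `gpt_params` and `LLAMA_EXAMPLE_COMMON` come from `common.h`; `print_usage` is the caller's own callback. The call shape mirrors the example changes later in this diff.

```cpp
// Minimal usage sketch for the new common/arg.h API (illustrative only).
#include "arg.h"
#include "common.h"

#include <cstdio>

static void print_usage(int /*argc*/, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    gpt_params params;

    // on an invalid value, usage for the offending argument is printed and false is returned
    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }

    // params is now populated from the CLI (and from any registered environment variables)
    return 0;
}
```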
common/common.cpp (1894 changes)
File diff suppressed because it is too large.
common/common.h (215 changes)

@@ -4,20 +4,11 @@
 #include "llama.h"
 
-#include "sampling.h"
-
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
-#include <cmath>
 #include <string>
 #include <vector>
-#include <random>
-#include <thread>
-#include <set>
-#include <unordered_map>
-#include <tuple>
-#include <functional>
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -56,11 +47,20 @@ struct llama_control_vector_load_info;
 // CPU utils
 //
 
+struct cpu_params {
+    int n_threads = -1;
+    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool mask_valid = false; // Default: any CPU
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool strict_cpu = false; // Use strict CPU placement
+    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 int32_t cpu_get_num_physical_cores();
 int32_t cpu_get_num_math();
 
 //
-// CLI argument parsing
+// Common params
 //
 
 enum llama_example {
@@ -78,28 +78,71 @@ enum llama_example {
     LLAMA_EXAMPLE_CVECTOR_GENERATOR,
     LLAMA_EXAMPLE_EXPORT_LORA,
     LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_LOOKUP,
+    LLAMA_EXAMPLE_PARALLEL,
 
     LLAMA_EXAMPLE_COUNT,
 };
 
+enum gpt_sampler_type {
+    GPT_SAMPLER_TYPE_NONE = 0,
+    GPT_SAMPLER_TYPE_TOP_K = 1,
+    GPT_SAMPLER_TYPE_TOP_P = 2,
+    GPT_SAMPLER_TYPE_MIN_P = 3,
+    GPT_SAMPLER_TYPE_TFS_Z = 4,
+    GPT_SAMPLER_TYPE_TYPICAL_P = 5,
+    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+};
+
 // dimensionality reduction methods, used by cvector-generator
 enum dimre_method {
     DIMRE_METHOD_PCA,
     DIMRE_METHOD_MEAN,
 };
 
-struct cpu_params {
-    int n_threads = -1;
-    bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
-    bool mask_valid = false; // Default: any CPU
-    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
-    bool strict_cpu = false; // Use strict CPU placement
-    uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+// sampler parameters
+struct gpt_sampler_params {
+    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
+
+    int32_t n_prev = 64; // number of previous tokens to remember
+    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
+    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
+    int32_t top_k = 40; // <= 0 to use vocab size
+    float top_p = 0.95f; // 1.0 = disabled
+    float min_p = 0.05f; // 0.0 = disabled
+    float tfs_z = 1.00f; // 1.0 = disabled
+    float typ_p = 1.00f; // typical_p, 1.0 = disabled
+    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
+    float dynatemp_range = 0.00f; // 0.0 = disabled
+    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
+    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float penalty_repeat = 1.00f; // 1.0 = disabled
+    float penalty_freq = 0.00f; // 0.0 = disabled
+    float penalty_present = 0.00f; // 0.0 = disabled
+    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau = 5.00f; // target entropy
+    float mirostat_eta = 0.10f; // learning rate
+    bool penalize_nl = false; // consider newlines as a repeatable token
+    bool ignore_eos = false;
+
+    std::vector<enum gpt_sampler_type> samplers = {
+        GPT_SAMPLER_TYPE_TOP_K,
+        GPT_SAMPLER_TYPE_TFS_Z,
+        GPT_SAMPLER_TYPE_TYPICAL_P,
+        GPT_SAMPLER_TYPE_TOP_P,
+        GPT_SAMPLER_TYPE_MIN_P,
+        GPT_SAMPLER_TYPE_TEMPERATURE
+    };
+
+    std::string grammar; // optional BNF-like grammar to constrain sampling
+
+    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
+
+    // print the parameters into a string
+    std::string print() const;
 };
 
 struct gpt_params {
-    enum llama_example curr_ex = LLAMA_EXAMPLE_COMMON;
-
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 0; // context size
     int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
@@ -143,23 +186,23 @@ struct gpt_params {
 
     struct gpt_sampler_params sparams;
 
-    std::string model = ""; // model path
-    std::string model_draft = ""; // draft model for speculative decoding
-    std::string model_alias = "unknown"; // model alias
-    std::string model_url = ""; // model url to download
-    std::string hf_token = ""; // HF token
-    std::string hf_repo = ""; // HF repo
-    std::string hf_file = ""; // HF file
-    std::string prompt = "";
-    std::string prompt_file = ""; // store the external prompt file name
-    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
-    std::string input_prefix = ""; // string to prefix user inputs with
-    std::string input_suffix = ""; // string to suffix user inputs with
-    std::string logdir = ""; // directory in which to save YAML log files
-    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
-    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
-    std::string logits_file = ""; // file for saving *all* logits
-    std::string rpc_servers = ""; // comma separated list of RPC servers
+    std::string model = ""; // model path // NOLINT
+    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
+    std::string model_alias = "unknown"; // model alias // NOLINT
+    std::string model_url = ""; // model url to download // NOLINT
+    std::string hf_token = ""; // HF token // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+    std::string prompt = ""; // NOLINT
+    std::string prompt_file = ""; // store the external prompt file name // NOLINT
+    std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
+    std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
+    std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
+    std::string logdir = ""; // directory in which to save YAML log files // NOLINT
+    std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
+    std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
+    std::string logits_file = ""; // file for saving *all* logits // NOLINT
+    std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
 
     std::vector<std::string> in_files; // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@@ -189,7 +232,6 @@ struct gpt_params {
 
     bool kl_divergence = false; // compute KL divergence
 
-    std::function<void(int, char **)> print_usage = nullptr; // print example-specific usage and example
     bool usage = false; // print usage
     bool use_color = false; // use color to distinguish generations and inputs
     bool special = false; // enable special token output

@@ -211,7 +253,6 @@ struct gpt_params {
     bool use_mlock = false; // use mlock to keep model in memory
     bool verbose_prompt = false; // print prompt tokens before generation
     bool display_prompt = true; // print prompt before generation
-    bool infill = false; // use infill mode
     bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload = false; // disable KV offloading
     bool warmup = true; // warmup run

@@ -221,7 +262,7 @@ struct gpt_params {
     std::string cache_type_v = "f16"; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector
+    std::string mmproj = ""; // path to multimodal projector // NOLINT
     std::vector<std::string> image; // path to image file(s)
 
     // embedding

@@ -237,15 +278,15 @@ struct gpt_params {
     int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
 
     std::string hostname = "127.0.0.1";
-    std::string public_path = "";
-    std::string chat_template = "";
-    std::string system_prompt = "";
+    std::string public_path = ""; // NOLINT
+    std::string chat_template = ""; // NOLINT
+    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;
 
     std::vector<std::string> api_keys;
 
-    std::string ssl_file_key = "";
-    std::string ssl_file_cert = "";
+    std::string ssl_file_key = ""; // NOLINT
+    std::string ssl_file_cert = ""; // NOLINT
 
     bool endpoint_slots = true;
     bool endpoint_metrics = false;
@@ -300,92 +341,6 @@ struct gpt_params {
     bool batched_bench_output_jsonl = false;
 };
 
-struct llama_arg {
-    std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
-    std::vector<const char *> args;
-    const char * value_hint = nullptr; // help text or example for arg value
-    const char * value_hint_2 = nullptr; // for second arg value
-    const char * env = nullptr;
-    std::string help;
-    void (*handler_void)   (gpt_params & params) = nullptr;
-    void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
-    void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
-    void (*handler_int)    (gpt_params & params, int) = nullptr;
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &)
-    ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const std::string & help,
-        void (*handler)(gpt_params & params, int)
-    ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
-
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const std::string & help,
-        void (*handler)(gpt_params & params)
-    ) : args(args), help(help), handler_void(handler) {}
-
-    // support 2 values for arg
-    llama_arg(
-        const std::initializer_list<const char *> & args,
-        const char * value_hint,
-        const char * value_hint_2,
-        const std::string & help,
-        void (*handler)(gpt_params & params, const std::string &, const std::string &)
-    ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
-
-    llama_arg & set_examples(std::initializer_list<enum llama_example> examples) {
-        this->examples = std::move(examples);
-        return *this;
-    }
-
-    llama_arg & set_env(const char * env) {
-        help = help + "\n(env: " + env + ")";
-        this->env = env;
-        return *this;
-    }
-
-    bool in_example(enum llama_example ex) {
-        return examples.find(ex) != examples.end();
-    }
-
-    bool get_value_from_env(std::string & output) const {
-        if (env == nullptr) return false;
-        char * value = std::getenv(env);
-        if (value) {
-            output = value;
-            return true;
-        }
-        return false;
-    }
-
-    bool has_value_from_env() const {
-        return env != nullptr && std::getenv(env);
-    }
-
-    std::string to_string();
-};
-
-// initialize list of options (arguments) that can be used by the current example
-std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex);
-// optionally, we can provide "print_usage" to print example usage
-std::vector<llama_arg> gpt_params_parser_init(gpt_params & params, llama_example ex, std::function<void(int, char **)> print_usage);
-
-// parse input arguments from CLI
-// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse (int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params, std::vector<llama_arg> & options);
-
-// print full usage message; it will be called internally by gpt_params_parse() if "-h" is set
-void gpt_params_print_usage(gpt_params & params, std::vector<llama_arg> & options);
-
 std::string gpt_params_get_system_info(const gpt_params & params);
 
 bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
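Since the sampling defaults now live in `common.h` as `gpt_params::sparams`, here is a rough sketch (not from this commit) of how they can be overridden before a sampler is constructed. `gpt_sampler_init` is assumed to be the constructor declared in `common/sampling.h`, as the example changes further down suggest; only fields that appear in the diff above are touched.

```cpp
// Illustrative sketch: overriding gpt_sampler_params defaults from common.h.
#include "common.h"
#include "sampling.h" // assumed to declare gpt_sampler_init()

static struct gpt_sampler * make_sampler(llama_model * model, gpt_params & params) {
    gpt_sampler_params & sparams = params.sparams;

    sparams.seed  = 1234;  // LLAMA_DEFAULT_SEED by default
    sparams.top_k = 50;    // <= 0 would mean "use vocab size"
    sparams.temp  = 0.7f;  // <= 0.0 samples greedily

    // restrict the sampler chain to top-k followed by temperature
    sparams.samplers = { GPT_SAMPLER_TYPE_TOP_K, GPT_SAMPLER_TYPE_TEMPERATURE };

    // the caller owns the returned sampler and frees it with the matching teardown call
    return gpt_sampler_init(model, sparams);
}
```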
@@ -2,6 +2,9 @@
 
 #include "common.h"
 
+#include <cmath>
+#include <unordered_map>
+
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
 template<typename T>

@@ -307,6 +310,10 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
     return cur_p.data[cur_p.selected].id;
 }
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
+    return llama_sampler_get_seed(gsmpl->chain);
+}
+
 // helpers
 
 llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {

@@ -420,7 +427,7 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
 }
 
 std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
-    std::unordered_map<char, gpt_sampler_type> sampler_name_map {
+    std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
         { gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
@@ -2,61 +2,11 @@
 
 #include "llama.h"
 
+#include "common.h"
+
 #include <string>
 #include <vector>
 
-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE = 0,
-    GPT_SAMPLER_TYPE_TOP_K = 1,
-    GPT_SAMPLER_TYPE_TOP_P = 2,
-    GPT_SAMPLER_TYPE_MIN_P = 3,
-    GPT_SAMPLER_TYPE_TFS_Z = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-};
-
-// sampling parameters
-struct gpt_sampler_params {
-    uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
-
-    int32_t n_prev = 64; // number of previous tokens to remember
-    int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
-    int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
-    int32_t top_k = 40; // <= 0 to use vocab size
-    float top_p = 0.95f; // 1.0 = disabled
-    float min_p = 0.05f; // 0.0 = disabled
-    float tfs_z = 1.00f; // 1.0 = disabled
-    float typ_p = 1.00f; // typical_p, 1.0 = disabled
-    float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
-    float dynatemp_range = 0.00f; // 0.0 = disabled
-    float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
-    int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float penalty_repeat = 1.00f; // 1.0 = disabled
-    float penalty_freq = 0.00f; // 0.0 = disabled
-    float penalty_present = 0.00f; // 0.0 = disabled
-    int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
-    float mirostat_tau = 5.00f; // target entropy
-    float mirostat_eta = 0.10f; // learning rate
-    bool penalize_nl = false; // consider newlines as a repeatable token
-    bool ignore_eos = false;
-
-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_TEMPERATURE
-    };
-
-    std::string grammar; // optional BNF-like grammar to constrain sampling
-
-    std::vector<llama_logit_bias> logit_bias; // logit biases to apply
-
-    // print the parameters into a string
-    std::string print() const;
-};
-
 // gpt_sampler extends llama_sampler with additional functionality:
 //
 // - grammar support

@@ -110,6 +60,8 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
 //
 llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
 
+uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
+
 // helpers
 
 // access the internal list of current candidate tokens
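To show how the newly exposed seed accessor fits into a generation step, here is an illustrative sketch (not part of this commit). It only uses declarations visible in the hunks above; the seed logging mirrors the updated `main` and `infill` examples below. Separately, the example hunks that follow drop their explicit `llama_sampler_accept` calls right after `llama_sampler_sample`, which suggests the low-level sample call now handles acceptance itself.

```cpp
// Illustrative sketch: querying the seed and sampling with the gpt_sampler declared above.
#include "common.h"
#include "sampling.h"
#include "llama.h"

#include <cstdio>

static void generate_n(struct gpt_sampler * smpl, llama_context * ctx, int n_predict) {
    // log the effective seed once, as the updated examples do
    fprintf(stderr, "sampling seed: %u\n", gpt_sampler_get_seed(smpl));

    for (int i = 0; i < n_predict; ++i) {
        // sample from the logits of the last evaluated token (idx = -1)
        const llama_token id = gpt_sampler_sample(smpl, ctx, /*idx =*/ -1);
        (void) id; // decode/accept/print as the calling example requires
    }
}
```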
@@ -302,6 +302,8 @@ class Model:
                        gguf.MODEL_TENSOR.TIME_MIX_FIRST,
                        gguf.MODEL_TENSOR.TIME_MIX_W1,
                        gguf.MODEL_TENSOR.TIME_MIX_W2,
+                       gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1,
+                       gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2,
                     )
                 )
                 or not new_name.endswith(".weight")
@@ -380,3 +380,9 @@ For detailed info, such as model/device supports, CANN install, please refer to
 ### Android
 
 To read documentation for how to build on Android, [click here](./android.md)
+
+### Arm CPU optimized mulmat kernels
+
+Llama.cpp includes a set of optimized mulmat kernels for the Arm architecture, leveraging Arm® Neon™, int8mm and SVE instructions. These kernels are enabled at build time through the appropriate compiler cpu-type flags, such as `-DCMAKE_C_FLAGS=-march=armv8.2a+i8mm+sve`. Note that these optimized kernels require the model to be quantized into one of the formats: `Q4_0_4_4` (Arm Neon), `Q4_0_4_8` (int8mm) or `Q4_0_8_8` (SVE). The SVE mulmat kernel specifically requires a vector width of 256 bits. When running on devices with a different vector width, it is recommended to use the `Q4_0_4_8` (int8mm) or `Q4_0_4_4` (Arm Neon) formats for better performance. Refer to [examples/quantize/README.md](../examples/quantize/README.md) for more information on the quantization formats.
+
+To support `Q4_0_4_4`, you must build with `GGML_NO_LLAMAFILE=1` (`make`) or `-DGGML_LLAMAFILE=OFF` (`cmake`).
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -37,8 +38,7 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_BENCH, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
         return 1;
     }
 
@@ -140,8 +140,6 @@ while n_cur <= n_len {
 
         let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
 
-        llama_sampler_accept(smpl, new_token_id)
-
         // is it an end of stream? -> mark the stream as finished
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             i_batch[i] = -1
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
         return 1;
     }
 

@@ -172,8 +172,6 @@ int main(int argc, char ** argv) {
 
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
 
-            llama_sampler_accept(smpl, new_token_id);
-
             // is it an end of generation? -> mark the stream as finished
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"

@@ -388,8 +389,7 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
         return 1;
     }
 
@@ -12,12 +12,9 @@
 
 #include <cstdio>
 #include <ctime>
+#include <random>
 #include <string>
-#include <tuple>
 #include <vector>
-#include <algorithm>
-#include <iostream>
-#include <fstream>
 
 #define DEBUG_POS 5
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -79,8 +80,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EMBEDDING);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
         return 1;
     }
 

@@ -90,8 +90,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 #include "ggml.h"

@@ -144,8 +145,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "ggml.h"
 #include "ggml-alloc.h"

@@ -401,8 +402,7 @@ static void print_usage(int, char ** argv) {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
         return 1;
     }
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 
 #include <fstream>

@@ -9,11 +10,11 @@ static void export_md(std::string fname, llama_example ex) {
     std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);
 
     gpt_params params;
-    auto options = gpt_params_parser_init(params, ex);
+    auto ctx_arg = gpt_params_parser_init(params, ex);
 
     file << "| Argument | Explanation |\n";
     file << "| -------- | ----------- |\n";
-    for (auto & opt : options) {
+    for (auto & opt : ctx_arg.options) {
         file << "| `";
         // args
         for (const auto & arg : opt.args) {
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -121,7 +122,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
         llama_decode(ctx, bat);
 
         llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
-        llama_sampler_accept(smpl, token);
 
         if (token == eos_token) {
             break;

@@ -154,8 +154,7 @@ static std::string gritlm_instruction(const std::string & instruction) {
 int main(int argc, char * argv[]) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -577,8 +578,7 @@ int main(int argc, char ** argv) {
     params.logits_all = true;
     params.verbosity = 1;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
         return 1;
     }
 
@@ -1,6 +1,7 @@
+#include "arg.h"
 #include "common.h"
 
 #include "console.h"
+#include "sampling.h"
 #include "llama.h"
 
 #include <cassert>

@@ -105,8 +106,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_INFILL);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
         return 1;
     }
 

@@ -159,8 +159,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);

@@ -301,16 +299,14 @@ int main(int argc, char ** argv) {
             LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
+    smpl = gpt_sampler_init(model, sparams);
+
+    LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
     LOG_TEE("sampling: \n%s\n", sparams.print().c_str());
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     LOG_TEE("\n\n");
 
     LOG_TEE("\n##### Infill mode #####\n\n");
-    if (params.infill) {
-        printf("\n************\n");
-        printf("no need to specify '--infill', always running infill\n");
-        printf("************\n\n");
-    }
     if (params.interactive) {
         const char *control_message;
         if (params.multiline_input) {

@@ -345,8 +341,6 @@ int main(int argc, char ** argv) {
 
     std::vector<llama_token> embd;
 
-    smpl = gpt_sampler_init(model, sparams);
-
     while (n_remain != 0 || params.interactive) {
         // predict
         if (!embd.empty()) {
@@ -414,8 +414,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     // sample the most likely token
     const auto new_token_id = llama_sampler_sample(sampler, context, -1);
 
-    llama_sampler_accept(sampler, new_token_id);
-
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
         return nullptr;
@@ -152,8 +152,6 @@ actor LlamaContext {
 
         new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
 
-        llama_sampler_accept(sampling, new_token_id)
-
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
             is_done = true
@@ -1,11 +1,12 @@
-#include "ggml.h"
+#include "arg.h"
+#include "base64.hpp"
 #include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
-#include "base64.hpp"
+#include "ggml.h"
 
 #include <cstdio>
 #include <cstdlib>

@@ -278,8 +279,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_LLAVA, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
         return 1;
     }
 
@@ -1,9 +1,11 @@
-#include "ggml.h"
+#include "arg.h"
 #include "log.h"
 #include "common.h"
+#include "sampling.h"
 #include "clip.h"
 #include "llava.h"
 #include "llama.h"
+#include "ggml.h"
 
 #include <cstdio>
 #include <cstdlib>

@@ -253,8 +255,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, show_additional_info);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, show_additional_info)) {
         return 1;
     }
 
@@ -1,4 +1,6 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "llama.h"
 
 #include <cstdio>

@@ -36,8 +38,7 @@ struct ngram_container {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -1,7 +1,8 @@
-#include "ggml.h"
-#include "llama.h"
+#include "arg.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "ggml.h"
+#include "llama.h"
 
 #include <cstdint>
 #include <fstream>

@@ -13,8 +14,7 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 

@@ -40,4 +40,6 @@ int main(int argc, char ** argv){
     fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
 
     llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+
+    return 0;
 }
@@ -1,8 +1,9 @@
-#include "ggml.h"
+#include "arg.h"
 #include "common.h"
-#include "llama.h"
 #include "log.h"
 #include "ngram-cache.h"
+#include "llama.h"
+#include "ggml.h"
 
 #include <cmath>
 #include <cstdint>

@@ -15,8 +16,7 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
@@ -1,7 +1,9 @@
+#include "arg.h"
 #include "ggml.h"
-#include "llama.h"
 #include "common.h"
 #include "ngram-cache.h"
+#include "sampling.h"
+#include "llama.h"
 
 #include <cstdint>
 #include <cstdio>

@@ -12,8 +14,7 @@
 int main(int argc, char ** argv){
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
         return 1;
     }
 
@@ -1,6 +1,7 @@
+#include "arg.h"
 #include "common.h"
 
 #include "console.h"
+#include "sampling.h"
 #include "llama.h"
 
 #include <cassert>

@@ -138,9 +139,7 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
 int main(int argc, char ** argv) {
     gpt_params params;
     g_params = &params;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_MAIN, print_usage);
-
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
         return 1;
     }
 

@@ -192,8 +191,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     LOG("%s: llama backend init\n", __func__);
     llama_backend_init();
     llama_numa_init(params.numa);

@@ -471,8 +468,10 @@ int main(int argc, char ** argv) {
         exit(1);
     }
 
+    LOG_TEE("sampling seed: %u\n", gpt_sampler_get_seed(smpl));
     LOG_TEE("sampling params: \n%s\n", sparams.print().c_str());
-    LOG_TEE(" sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
+    LOG_TEE("sampler constr: \n%s\n", gpt_sampler_print(smpl).c_str());
 
     LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
     // group-attention state
@@ -1,7 +1,9 @@
 // A basic application simulating a server with multiple clients.
 // The clients submit requests to the server and they are processed in parallel.
 
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "llama.h"
 
 #include <cmath>

@@ -100,8 +102,7 @@ int main(int argc, char ** argv) {
 
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
         return 1;
     }
 
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -19,8 +20,7 @@ int main(int argc, char ** argv) {
     params.n_keep = 32;
     params.i_pos = -1;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PASSKEY, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
         return 1;
     }
 

@@ -220,8 +220,6 @@ int main(int argc, char ** argv) {
         {
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
 
-            llama_sampler_accept(smpl, new_token_id);
-
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                 LOG_TEE("\n");
@@ -1,18 +1,19 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 
+#include <array>
+#include <atomic>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <fstream>
+#include <mutex>
+#include <random>
 #include <sstream>
 #include <thread>
-#include <mutex>
-#include <atomic>
 #include <vector>
-#include <array>
-#include <fstream>
-#include <sstream>
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data

@@ -1967,8 +1968,7 @@ int main(int argc, char ** argv) {
     params.n_ctx = 512;
     params.logits_all = true;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_PERPLEXITY);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
         return 1;
     }
 

@@ -2007,8 +2007,6 @@ int main(int argc, char ** argv) {
 
     print_build_info();
 
-    LOG_TEE("%s: seed = %u\n", __func__, params.sparams.seed);
-
     llama_backend_init();
     llama_numa_init(params.numa);
 
@@ -54,6 +54,8 @@ As the models are currently fully loaded into memory, you will need adequate dis
 
 Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
 
+The quantization formats `Q4_0_4_4`, `Q4_0_4_8` and `Q4_0_8_8` are block interleaved variants of the `Q4_0` format, providing a data layout that is better suited for specific implementations of optimized mulmat kernels. Since these formats differ only in data layout, they have the same quantized size as the `Q4_0` format.
+
 *(outdated)*
 
 | Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 |
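For convenience, a minimal sketch of producing one of these interleaved variants with `llama-quantize` (the input/output paths are hypothetical; the type name is as documented above):

```bash
# Requantize an F16 GGUF into the block-interleaved Q4_0_4_4 layout (same quantized size as Q4_0).
./llama-quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-Q4_0_4_4.gguf Q4_0_4_4
```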
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 
@@ -111,8 +112,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_RETRIEVAL, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
         return 1;
     }
 
@@ -10,20 +10,21 @@ This can be used for distributed LLM inference with `llama.cpp` in the following
 
 ```mermaid
 flowchart TD
-    rpcb---|TCP|srva
-    rpcb---|TCP|srvb
-    rpcb-.-|TCP|srvn
+    rpcb<-->|TCP|srva
+    rpcb<-->|TCP|srvb
+    rpcb<-.->|TCP|srvn
     subgraph hostn[Host N]
-    srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"]
+    srvn[rpc-server]<-.->backend3["Backend (CUDA,Metal,etc.)"]
     end
     subgraph hostb[Host B]
-    srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"]
+    srvb[rpc-server]<-->backend2["Backend (CUDA,Metal,etc.)"]
     end
     subgraph hosta[Host A]
-    srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"]
+    srva[rpc-server]<-->backend["Backend (CUDA,Metal,etc.)"]
     end
     subgraph host[Main Host]
-    ggml[llama.cpp]---rpcb[RPC backend]
+    local["Backend (CUDA,Metal,etc.)"]<-->ggml[llama-cli]
+    ggml[llama-cli]<-->rpcb[RPC backend]
     end
     style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5
 ```
@@ -62,17 +63,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
 This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
 
 
-On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
-
-```bash
-mkdir build-rpc
-cd build-rpc
-cmake .. -DGGML_RPC=ON
-cmake --build . --config Release
-```
-
-Finally, use the `--rpc` option to specify the host and port of each `rpc-server`:
+On the main host build `llama.cpp` for the local backend and add `-DGGML_RPC=ON` to the build options.
+Finally, when running `llama-cli`, use the `--rpc` option to specify the host and port of each `rpc-server`:
 
 ```bash
 $ bin/llama-cli -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99
 ```
 
+This way you can offload model layers to both local and remote devices.
+
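As a hedged illustration of the new wording above (the backend flag and build directory are assumptions; use the flag matching your hardware), building the main host with both a local CUDA backend and the RPC backend could look like:

```bash
# Main host: enable a local backend (CUDA here) together with the RPC backend.
cmake -B build-rpc-cuda -DGGML_CUDA=ON -DGGML_RPC=ON
cmake --build build-rpc-cuda --config Release
```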
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 
@@ -10,8 +11,7 @@ int main(int argc, char ** argv) {
     params.prompt = "The quick brown fox";
     params.sparams.seed = 1234;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
     }
 
@@ -74,8 +74,6 @@ int main(int argc, char ** argv) {
         auto next_token     = llama_sampler_sample(smpl, ctx, -1);
         auto next_token_str = llama_token_to_piece(ctx, next_token);
 
-        llama_sampler_accept(smpl, next_token);
-
         printf("%s", next_token_str.c_str());
         result0 += next_token_str;
 
@@ -132,8 +130,6 @@ int main(int argc, char ** argv) {
         auto next_token     = llama_sampler_sample(smpl2, ctx2, -1);
         auto next_token_str = llama_token_to_piece(ctx2, next_token);
 
-        llama_sampler_accept(smpl2, next_token);
-
         printf("%s", next_token_str.c_str());
         result1 += next_token_str;
 
@@ -222,8 +218,6 @@ int main(int argc, char ** argv) {
         auto next_token     = llama_sampler_sample(smpl3, ctx3, -1);
         auto next_token_str = llama_token_to_piece(ctx3, next_token);
 
-        llama_sampler_accept(smpl3, next_token);
-
         printf("%s", next_token_str.c_str());
         result2 += next_token_str;
 
@@ -23,36 +23,32 @@ The project is under active development, and we are [looking for feedback and co
 | `--version` | show version and build info |
 | `-v, --verbose` | print verbose information |
 | `--verbosity N` | set specific verbosity level (default: 0) |
-| `--verbose-prompt` | print a verbose prompt before generation (default: false) |
-| `--no-display-prompt` | don't print prompt at generation (default: false) |
-| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
 | `-t, --threads N` | number of threads to use during generation (default: -1)<br/>(env: LLAMA_ARG_THREADS) |
 | `-tb, --threads-batch N` | number of threads to use during batch and prompt processing (default: same as --threads) |
 | `-C, --cpu-mask M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: "") |
 | `-Cr, --cpu-range lo-hi` | range of CPUs for affinity. Complements --cpu-mask |
 | `--cpu-strict <0\|1>` | use strict CPU placement (default: 0)<br/> |
+| `--prio N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
 | `--poll <0...100>` | use polling level to wait for work (0 - no polling, default: 50)<br/> |
 | `-Cb, --cpu-mask-batch M` | CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask) |
 | `-Crb, --cpu-range-batch lo-hi` | ranges of CPUs for affinity. Complements --cpu-mask-batch |
 | `--cpu-strict-batch <0\|1>` | use strict CPU placement (default: same as --cpu-strict) |
+| `--prio-batch N` | set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: 0)<br/> |
 | `--poll-batch <0\|1>` | use polling to wait for work (default: same as --poll) |
-| `-lcs, --lookup-cache-static FNAME` | path to static lookup cache to use for lookup decoding (not updated by generation) |
-| `-lcd, --lookup-cache-dynamic FNAME` | path to dynamic lookup cache to use for lookup decoding (updated by generation) |
 | `-c, --ctx-size N` | size of the prompt context (default: 0, 0 = loaded from model)<br/>(env: LLAMA_ARG_CTX_SIZE) |
 | `-n, --predict, --n-predict N` | number of tokens to predict (default: -1, -1 = infinity, -2 = until context filled)<br/>(env: LLAMA_ARG_N_PREDICT) |
 | `-b, --batch-size N` | logical maximum batch size (default: 2048)<br/>(env: LLAMA_ARG_BATCH) |
 | `-ub, --ubatch-size N` | physical maximum batch size (default: 512)<br/>(env: LLAMA_ARG_UBATCH) |
 | `--keep N` | number of tokens to keep from the initial prompt (default: 0, -1 = all) |
-| `--chunks N` | max number of chunks to process (default: -1, -1 = all) |
 | `-fa, --flash-attn` | enable Flash Attention (default: disabled)<br/>(env: LLAMA_ARG_FLASH_ATTN) |
 | `-p, --prompt PROMPT` | prompt to start generation with |
 | `-f, --file FNAME` | a file containing the prompt (default: none) |
-| `--in-file FNAME` | an input file (repeat to specify multiple files) |
 | `-bf, --binary-file FNAME` | binary file containing the prompt (default: none) |
 | `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
 | `--no-escape` | do not process escape sequences |
 | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typical_p;top_p;min_p;temperature) |
+| `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
+| `-s, --seed SEED` | RNG seed (default: -1, use random seed for < 0) |
 | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
 | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
 | `--penalize-nl` | penalize newline tokens (default: false) |

@@ -92,13 +88,12 @@ The project is under active development, and we are [looking for feedback and co
 | `-ctv, --cache-type-v TYPE` | KV cache data type for V (default: f16) |
 | `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: -1.0, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1) |
-| `-ns, --sequences N` | number of sequences to decode (default: 1) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock) |
 | `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437 |
-| `-ngl, --gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
+| `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
 | `-sm, --split-mode {none,layer,row}` | how to split the model across multiple GPUs, one of:<br/>- none: use one GPU only<br/>- layer (default): split layers and KV across GPUs<br/>- row: split rows across GPUs |
 | `-ts, --tensor-split N0,N1,N2,...` | fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1 |
 | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0) |

@@ -109,7 +104,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
 | `--control-vector-scaled FNAME SCALE` | add a control vector with user defined scaling SCALE<br/>note: this argument can be repeated to add multiple scaled control vectors |
 | `--control-vector-layer-range START END` | layer range to apply the control vector(s) to, start and end inclusive |
-| `-a, --alias STRING` | set alias for model name (to be used by REST API)<br/>(env: LLAMA_ARG_MODEL) |
+| `-a, --alias STRING` | set alias for model name (to be used by REST API) |
 | `-m, --model FNAME` | model path (default: `models/$filename` with filename from `--hf-file` or `--model-url` if set, otherwise models/7B/ggml-model-f16.gguf)<br/>(env: LLAMA_ARG_MODEL) |
 | `-mu, --model-url MODEL_URL` | model download url (default: unused)<br/>(env: LLAMA_ARG_MODEL_URL) |
 | `-hfr, --hf-repo REPO` | Hugging Face model repository (default: unused)<br/>(env: LLAMA_ARG_HF_REPO) |

@@ -123,7 +118,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate |
-| `--timeout N` | server read/write timeout in seconds (default: 600) |
+| `-to, --timeout N` | server read/write timeout in seconds (default: 600) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
 | `--log-format {text, json}` | log output format: json or text (default: json) |
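Many of the options above also list an `(env: ...)` variable; a minimal sketch of two equivalent invocations (model path and values are placeholders, and the server binary is assumed to be available as `llama-server`):

```bash
# Flags ...
./llama-server -m models/7B/ggml-model-f16.gguf -c 4096 -ngl 99 -t 8

# ... or the documented environment variables
LLAMA_ARG_MODEL=models/7B/ggml-model-f16.gguf \
LLAMA_ARG_CTX_SIZE=4096 \
LLAMA_ARG_N_GPU_LAYERS=99 \
LLAMA_ARG_THREADS=8 \
./llama-server
```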
@ -1,6 +1,8 @@
|
||||||
#include "utils.hpp"
|
#include "utils.hpp"
|
||||||
|
|
||||||
|
#include "arg.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
#include "sampling.h"
|
||||||
#include "json-schema-to-grammar.h"
|
#include "json-schema-to-grammar.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
|
@ -613,7 +615,7 @@ struct server_context {
|
||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
llama_batch batch;
|
llama_batch batch = {};
|
||||||
|
|
||||||
bool clean_kv_cache = true;
|
bool clean_kv_cache = true;
|
||||||
bool add_bos_token = true;
|
bool add_bos_token = true;
|
||||||
|
@ -1264,6 +1266,7 @@ struct server_context {
|
||||||
{"n_predict", slot.n_predict}, // Server configured n_predict
|
{"n_predict", slot.n_predict}, // Server configured n_predict
|
||||||
{"model", params.model_alias},
|
{"model", params.model_alias},
|
||||||
{"seed", slot.sparams.seed},
|
{"seed", slot.sparams.seed},
|
||||||
|
{"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
|
||||||
{"temperature", slot.sparams.temp},
|
{"temperature", slot.sparams.temp},
|
||||||
{"dynatemp_range", slot.sparams.dynatemp_range},
|
{"dynatemp_range", slot.sparams.dynatemp_range},
|
||||||
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
|
{"dynatemp_exponent", slot.sparams.dynatemp_exponent},
|
||||||
|
@ -2423,8 +2426,7 @@ int main(int argc, char ** argv) {
|
||||||
// own arguments required by this example
|
// own arguments required by this example
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SERVER);
|
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
|
||||||
if (!gpt_params_parse(argc, argv, params, options)) {
|
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -1,3 +1,4 @@
+#include "arg.h"
 #include "common.h"
 #include "llama.h"
 

@@ -18,8 +19,7 @@ int main(int argc, char ** argv) {
     params.prompt = "Hello my name is";
     params.n_predict = 32;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON, print_usage);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
         return 1;
     }
 

@@ -118,8 +118,6 @@ int main(int argc, char ** argv) {
         {
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
 
-            llama_sampler_accept(smpl, new_token_id);
-
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 LOG_TEE("\n");
@@ -1,11 +1,13 @@
+#include "arg.h"
 #include "common.h"
+#include "sampling.h"
 #include "llama.h"
 
-#include <cmath>
 #include <cstdio>
 #include <string>
 #include <vector>
 #include <set>
+#include <random>
 
 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE  100
 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

@@ -27,8 +29,7 @@ struct seq_draft {
 int main(int argc, char ** argv) {
     gpt_params params;
 
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_SPECULATIVE);
-    if (!gpt_params_parse(argc, argv, params, options)) {
+    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
         return 1;
     }
 
20
flake.lock
generated
20
flake.lock
generated
|
@ -5,11 +5,11 @@
|
||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1725024810,
|
"lastModified": 1725234343,
|
||||||
"narHash": "sha256-ODYRm8zHfLTH3soTFWE452ydPYz2iTvr9T8ftDMUQ3E=",
|
"narHash": "sha256-+ebgonl3NbiKD2UD0x4BszCZQ6sTfL4xioaM49o5B3Y=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "af510d4a62d071ea13925ce41c95e3dec816c01d",
|
"rev": "567b938d64d4b4112ee253b9274472dc3a346eb6",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1724819573,
|
"lastModified": 1725634671,
|
||||||
"narHash": "sha256-GnR7/ibgIH1vhoy8cYdmXE6iyZqKqFxQSVkFgosBh6w=",
|
"narHash": "sha256-v3rIhsJBOMLR8e/RNWxr828tB+WywYIoajrZKFM+0Gg=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "71e91c409d1e654808b2621f28a327acfdad8dc2",
|
"rev": "574d1eac1c200690e27b8eb4e24887f8df7ac27c",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -36,14 +36,14 @@
|
||||||
},
|
},
|
||||||
"nixpkgs-lib": {
|
"nixpkgs-lib": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1722555339,
|
"lastModified": 1725233747,
|
||||||
"narHash": "sha256-uFf2QeW7eAHlYXuDktm9c25OxOyCoUOQmh5SZ9amE5Q=",
|
"narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
|
||||||
"type": "tarball",
|
"type": "tarball",
|
||||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"type": "tarball",
|
"type": "tarball",
|
||||||
"url": "https://github.com/NixOS/nixpkgs/archive/a5d394176e64ab29c852d03346c1fc9b0b7d33eb.tar.gz"
|
"url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"root": {
|
"root": {
|
||||||
|
|
|
@@ -681,8 +681,8 @@ extern "C" {
 
     struct ggml_hash_set {
         size_t size;
-        ggml_bitset_t * used;
-        struct ggml_tensor ** keys;
+        ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+        struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
     };
 
     // computation graph

@@ -1272,7 +1272,7 @@ extern "C" {
             size_t nb1,
             size_t nb2,
             size_t nb3,
-            size_t offset);
+            size_t offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_inplace(

@@ -1282,19 +1282,19 @@ extern "C" {
             size_t nb1,
             size_t nb2,
             size_t nb3,
-            size_t offset);
+            size_t offset); // in bytes
 
     GGML_API struct ggml_tensor * ggml_set_1d(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            size_t offset);
+            size_t offset); // in bytes
 
     GGML_API struct ggml_tensor * ggml_set_1d_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b,
-            size_t offset);
+            size_t offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set_2d(

@@ -1302,7 +1302,7 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             size_t nb1,
-            size_t offset);
+            size_t offset); // in bytes
 
     // b -> view(a,offset,nb1,nb2,3), return view(a)
     GGML_API struct ggml_tensor * ggml_set_2d_inplace(

@@ -1310,7 +1310,7 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b,
             size_t nb1,
-            size_t offset);
+            size_t offset); // in bytes
 
     // a -> b, return view(b)
     GGML_API struct ggml_tensor * ggml_cpy(
@@ -827,6 +827,10 @@ GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        case GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
         default:
             return true;
     }
@@ -32,7 +32,7 @@ DOXYFILE_ENCODING = UTF-8
 # title of most generated pages and in a few other places.
 # The default value is: My Project.
 
-PROJECT_NAME = "llama.cpp"
+PROJECT_NAME = "ggml"
 
 # The PROJECT_NUMBER tag can be used to enter a project or revision number. This
 # could be handy for archiving the generated documentation or if some version

@@ -44,7 +44,7 @@ PROJECT_NUMBER =
 # for a project that appears at the top of each page and should give viewer a
 # quick idea about the purpose of the project. Keep the description short.
 
-PROJECT_BRIEF = "llama inference engine"
+PROJECT_BRIEF = "Tensor library for machine learning"
 
 # With the PROJECT_LOGO tag one can specify a logo or an icon that is included
 # in the documentation. The maximum height of the logo should not exceed 55
@ -27,6 +27,7 @@
|
||||||
#include "ggml-cuda/rope.cuh"
|
#include "ggml-cuda/rope.cuh"
|
||||||
#include "ggml-cuda/scale.cuh"
|
#include "ggml-cuda/scale.cuh"
|
||||||
#include "ggml-cuda/softmax.cuh"
|
#include "ggml-cuda/softmax.cuh"
|
||||||
|
#include "ggml-cuda/sum.cuh"
|
||||||
#include "ggml-cuda/sumrows.cuh"
|
#include "ggml-cuda/sumrows.cuh"
|
||||||
#include "ggml-cuda/tsembd.cuh"
|
#include "ggml-cuda/tsembd.cuh"
|
||||||
#include "ggml-cuda/unary.cuh"
|
#include "ggml-cuda/unary.cuh"
|
||||||
|
@ -2180,6 +2181,7 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
ggml_cuda_dup(ctx, dst);
|
ggml_cuda_dup(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
|
case GGML_OP_ADD1: // TODO: more efficient implementation
|
||||||
ggml_cuda_op_add(ctx, dst);
|
ggml_cuda_op_add(ctx, dst);
|
||||||
break;
|
break;
|
||||||
case GGML_OP_SUB:
|
case GGML_OP_SUB:
|
||||||
|
@ -2196,6 +2198,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
break;
|
break;
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(dst)) {
|
switch (ggml_get_unary_op(dst)) {
|
||||||
|
case GGML_UNARY_OP_NEG:
|
||||||
|
ggml_cuda_op_neg(ctx, dst);
|
||||||
|
break;
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
ggml_cuda_op_gelu(ctx, dst);
|
ggml_cuda_op_gelu(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
@ -2304,6 +2309,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
ggml_cuda_op_pool2d(ctx, dst);
|
ggml_cuda_op_pool2d(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
case GGML_OP_SUM:
|
||||||
|
ggml_cuda_op_sum(ctx, dst);
|
||||||
|
break;
|
||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
ggml_cuda_op_sum_rows(ctx, dst);
|
ggml_cuda_op_sum_rows(ctx, dst);
|
||||||
break;
|
break;
|
||||||
|
@ -2544,7 +2552,11 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_tensor * node = cgraph->nodes[i];
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
||||||
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
GGML_CUDA_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
|
||||||
|
@ -2748,6 +2760,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
switch (ggml_get_unary_op(op)) {
|
switch (ggml_get_unary_op(op)) {
|
||||||
|
case GGML_UNARY_OP_NEG:
|
||||||
case GGML_UNARY_OP_GELU:
|
case GGML_UNARY_OP_GELU:
|
||||||
case GGML_UNARY_OP_SILU:
|
case GGML_UNARY_OP_SILU:
|
||||||
case GGML_UNARY_OP_RELU:
|
case GGML_UNARY_OP_RELU:
|
||||||
|
@ -2877,6 +2890,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_OP_TRANSPOSE:
|
case GGML_OP_TRANSPOSE:
|
||||||
case GGML_OP_NORM:
|
case GGML_OP_NORM:
|
||||||
case GGML_OP_ADD:
|
case GGML_OP_ADD:
|
||||||
|
case GGML_OP_ADD1:
|
||||||
case GGML_OP_SUB:
|
case GGML_OP_SUB:
|
||||||
case GGML_OP_MUL:
|
case GGML_OP_MUL:
|
||||||
case GGML_OP_DIV:
|
case GGML_OP_DIV:
|
||||||
|
@ -2887,14 +2901,18 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
||||||
case GGML_OP_SIN:
|
case GGML_OP_SIN:
|
||||||
case GGML_OP_COS:
|
case GGML_OP_COS:
|
||||||
case GGML_OP_CLAMP:
|
case GGML_OP_CLAMP:
|
||||||
|
return true;
|
||||||
case GGML_OP_CONT:
|
case GGML_OP_CONT:
|
||||||
|
return op->src[0]->type != GGML_TYPE_BF16;
|
||||||
case GGML_OP_DIAG_MASK_INF:
|
case GGML_OP_DIAG_MASK_INF:
|
||||||
case GGML_OP_SOFT_MAX:
|
case GGML_OP_SOFT_MAX:
|
||||||
return true;
|
return true;
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
return ggml_is_contiguous(op->src[0]);
|
return ggml_is_contiguous(op->src[0]);
|
||||||
case GGML_OP_IM2COL:
|
case GGML_OP_IM2COL:
|
||||||
|
return op->src[0]->type == GGML_TYPE_F16;
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
|
case GGML_OP_SUM:
|
||||||
case GGML_OP_SUM_ROWS:
|
case GGML_OP_SUM_ROWS:
|
||||||
case GGML_OP_ARGSORT:
|
case GGML_OP_ARGSORT:
|
||||||
case GGML_OP_ACC:
|
case GGML_OP_ACC:
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
#include "cross-entropy-loss.cuh"
|
#include "cross-entropy-loss.cuh"
|
||||||
#include "sumrows.cuh"
|
#include "sum.cuh"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
@ -102,5 +102,5 @@ void ggml_cuda_cross_entropy_loss(ggml_backend_cuda_context & ctx, ggml_tensor *
|
||||||
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
cross_entropy_loss_f32<<<blocks_num, blocks_dim, shmem, stream>>>(src0_d, src1_d, dst_tmp.ptr, ne00, nrows);
|
||||||
|
|
||||||
// Combine results from individual blocks:
|
// Combine results from individual blocks:
|
||||||
sum_rows_f32_cuda(dst_tmp.ptr, dst_d, blocks_num.x, 1, stream);
|
sum_f32_cuda(pool, dst_tmp.ptr, dst_d, blocks_num.x, stream);
|
||||||
}
|
}
|
||||||
|
|
|
@@ -152,7 +152,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
     } \
 
 static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q = dst->src[1];
+    ggml_tensor * Q = dst->src[0];
     ggml_tensor * K = dst->src[1];
     ggml_tensor * V = dst->src[2];
 

@@ -227,7 +227,7 @@ static void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, gg
     } \
 
 static void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    ggml_tensor * Q = dst->src[1];
+    ggml_tensor * Q = dst->src[0];
     ggml_tensor * K = dst->src[1];
     ggml_tensor * V = dst->src[2];
 
ggml/src/ggml-cuda/sum.cu (new file, 43 lines)
@@ -0,0 +1,43 @@
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
+// For this reason CUB must be included BEFORE anything else.
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+
+#include "sumrows.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+    size_t tmp_size = 0;
+    DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
+    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+    DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
+#else
+    // Use (inefficient) sum_rows implementation as a fallback.
+    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
+    sum_rows_f32_cuda(x, dst, ne, 1, stream);
+    GGML_UNUSED(pool);
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
+}
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+
+    const int64_t ne = ggml_nelements(src0);
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
+}

ggml/src/ggml-cuda/sum.cuh (new file, 5 lines)
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream);
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
@ -1,5 +1,15 @@
|
||||||
#include "unary.cuh"
|
#include "unary.cuh"
|
||||||
|
|
||||||
|
static __global__ void neg_f32(const float * x, float * dst, const int k) {
|
||||||
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (i >= k) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
dst[i] = -x[i];
|
||||||
|
}
|
||||||
|
|
||||||
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
static __global__ void gelu_f32(const float * x, float * dst, const int k) {
|
||||||
const float GELU_COEF_A = 0.044715f;
|
const float GELU_COEF_A = 0.044715f;
|
||||||
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||||
|
@ -119,6 +129,11 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
|
||||||
dst[i] = cosf(x[i]);
|
dst[i] = cosf(x[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||||
|
const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
|
||||||
|
neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||||
|
}
|
||||||
|
|
||||||
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||||
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
|
||||||
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||||
|
@ -184,6 +199,20 @@ static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t
|
||||||
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
|
const ggml_tensor * src0 = dst->src[0];
|
||||||
|
const float * src0_d = (const float *)src0->data;
|
||||||
|
float * dst_d = (float *)dst->data;
|
||||||
|
cudaStream_t stream = ctx.stream();
|
||||||
|
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||||
|
}
|
||||||
|
|
||||||
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
const ggml_tensor * src0 = dst->src[0];
|
const ggml_tensor * src0 = dst->src[0];
|
||||||
const float * src0_d = (const float *)src0->data;
|
const float * src0_d = (const float *)src0->data;
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include "common.cuh"
|
#include "common.cuh"
|
||||||
|
|
||||||
|
#define CUDA_NEG_BLOCK_SIZE 256
|
||||||
#define CUDA_GELU_BLOCK_SIZE 256
|
#define CUDA_GELU_BLOCK_SIZE 256
|
||||||
#define CUDA_SILU_BLOCK_SIZE 256
|
#define CUDA_SILU_BLOCK_SIZE 256
|
||||||
#define CUDA_TANH_BLOCK_SIZE 256
|
#define CUDA_TANH_BLOCK_SIZE 256
|
||||||
|
@ -12,6 +13,8 @@
|
||||||
#define CUDA_SIN_BLOCK_SIZE 256
|
#define CUDA_SIN_BLOCK_SIZE 256
|
||||||
#define CUDA_COS_BLOCK_SIZE 256
|
#define CUDA_COS_BLOCK_SIZE 256
|
||||||
|
|
||||||
|
void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
|
||||||
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
|
||||||
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||||
|
|
|
@ -17,8 +17,8 @@
|
||||||
#define GGML_METAL_LOG_WARN(...)
|
#define GGML_METAL_LOG_WARN(...)
|
||||||
#define GGML_METAL_LOG_ERROR(...)
|
#define GGML_METAL_LOG_ERROR(...)
|
||||||
#else
|
#else
|
||||||
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
#define GGML_METAL_LOG_INFO(...) ggml_metal_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
||||||
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
#define GGML_METAL_LOG_WARN(...) ggml_metal_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
||||||
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
#define GGML_METAL_LOG_ERROR(...) ggml_metal_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -799,8 +799,9 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
|
||||||
return ctx->support_simdgroup_reduction;
|
return ctx->support_simdgroup_reduction;
|
||||||
case GGML_OP_NORM:
|
case GGML_OP_NORM:
|
||||||
case GGML_OP_ROPE:
|
case GGML_OP_ROPE:
|
||||||
case GGML_OP_IM2COL:
|
|
||||||
return true;
|
return true;
|
||||||
|
case GGML_OP_IM2COL:
|
||||||
|
return op->src[0]->type == GGML_TYPE_F16;
|
||||||
case GGML_OP_POOL_1D:
|
case GGML_OP_POOL_1D:
|
||||||
case GGML_OP_POOL_2D:
|
case GGML_OP_POOL_2D:
|
||||||
return false;
|
return false;
|
||||||
|
@ -3038,8 +3039,7 @@ static enum ggml_status ggml_metal_graph_compute(
|
||||||
if (status != MTLCommandBufferStatusCompleted) {
|
if (status != MTLCommandBufferStatusCompleted) {
|
||||||
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||||
if (status == MTLCommandBufferStatusError) {
|
if (status == MTLCommandBufferStatusError) {
|
||||||
NSString * error_code = [command_buffer error].localizedDescription;
|
GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
|
||||||
GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return GGML_STATUS_FAILED;
|
return GGML_STATUS_FAILED;
|
||||||
|
|
|
@ -4009,42 +4009,141 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
||||||
float sumf = 0;
|
float sumf = 0;
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_SVE)
|
#if defined(__ARM_FEATURE_SVE)
|
||||||
if (ggml_sve_cnt_b == QK8_0) {
|
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
||||||
const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
|
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
||||||
const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
|
|
||||||
|
|
||||||
svfloat32_t sumv0 = svdup_n_f32(0.0f);
|
const int vector_length = ggml_sve_cnt_b*8;
|
||||||
svfloat32_t sumv1 = svdup_n_f32(0.0f);
|
|
||||||
|
|
||||||
for (; ib + 1 < nb; ib += 2) {
|
// VLA Implementation using switch case
|
||||||
const block_q4_0 * restrict x0 = &x[ib + 0];
|
switch (vector_length) {
|
||||||
const block_q4_0 * restrict x1 = &x[ib + 1];
|
case 128:
|
||||||
const block_q8_0 * restrict y0 = &y[ib + 0];
|
{
|
||||||
const block_q8_0 * restrict y1 = &y[ib + 1];
|
// predicate for activating higher lanes for 4 float32 elements
|
||||||
|
const svbool_t ph4 = svptrue_pat_b32(SV_VL4);
|
||||||
|
|
||||||
// load x
|
for (; ib + 1 < nb; ib += 2) {
|
||||||
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
const block_q4_0 * restrict x0 = &x[ib + 0];
|
||||||
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
const block_q4_0 * restrict x1 = &x[ib + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||||
|
|
||||||
// 4-bit -> 8-bit
|
// load x
|
||||||
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx0r, 0x0F), 0x04));
|
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||||
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(ptruel, svand_n_u8_m(ptrueh, qx1r, 0x0F), 0x04));
|
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||||
|
|
||||||
// sub 8
|
// 4-bit -> 8-bit
|
||||||
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
const svint8_t qx0l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx0r, 0x0F));
|
||||||
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
const svint8_t qx0h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx0r, 0x04));
|
||||||
|
const svint8_t qx1l = svreinterpret_s8_u8(svand_n_u8_m(svptrue_b8(), qx1r, 0x0F));
|
||||||
|
const svint8_t qx1h = svreinterpret_s8_u8(svlsr_n_u8_m(svptrue_b8(), qx1r, 0x04));
|
||||||
|
|
||||||
// load y
|
// sub 8
|
||||||
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
const svint8_t qx0ls = svsub_n_s8_x(svptrue_b8(), qx0h, 8);
|
||||||
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
const svint8_t qx0hs = svsub_n_s8_x(svptrue_b8(), qx0l, 8);
|
||||||
|
const svint8_t qx1ls = svsub_n_s8_x(svptrue_b8(), qx1h, 8);
|
||||||
|
const svint8_t qx1hs = svsub_n_s8_x(svptrue_b8(), qx1l, 8);
|
||||||
|
|
||||||
// dot product
|
// load y
|
||||||
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
const svint8_t qy0h = svld1_s8(svptrue_b8(), y0->qs);
|
||||||
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
const svint8_t qy0l = svld1_s8(svptrue_b8(), y0->qs + 16);
|
||||||
}
|
const svint8_t qy1h = svld1_s8(svptrue_b8(), y1->qs);
|
||||||
|
const svint8_t qy1l = svld1_s8(svptrue_b8(), y1->qs + 16);
|
||||||
|
|
||||||
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
// dot product
|
||||||
|
sumv0 = svmla_n_f32_x(ph4, sumv0, svcvt_f32_s32_x(ph4, svadd_x(ph4,
|
||||||
|
svdot_s32(svdup_n_s32(0), qx0ls, qy0l),
|
||||||
|
svdot_s32(svdup_n_s32(0), qx0hs, qy0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(ph4, sumv1, svcvt_f32_s32_x(ph4, svadd_x(ph4,
|
||||||
|
svdot_s32(svdup_n_s32(0), qx1ls, qy1l),
|
||||||
|
svdot_s32(svdup_n_s32(0), qx1hs, qy1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||||
|
} break;
|
||||||
|
case 256:
|
||||||
|
{
|
||||||
|
// predicate for activating higher lanes for 16 int8 elements
|
||||||
|
const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
|
||||||
|
// predicate for activating lower lanes for 16 int8 elements
|
||||||
|
const svbool_t pl16 = svnot_b_z(svptrue_b8(), ph16);
|
||||||
|
|
||||||
|
for (; ib + 1 < nb; ib += 2) {
|
||||||
|
const block_q4_0 * restrict x0 = &x[ib + 0];
|
||||||
|
const block_q4_0 * restrict x1 = &x[ib + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||||
|
|
||||||
|
// load x
|
||||||
|
const svuint8_t qx0r = svld1rq_u8(svptrue_b8(), x0->qs);
|
||||||
|
const svuint8_t qx1r = svld1rq_u8(svptrue_b8(), x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
|
||||||
|
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
|
||||||
|
|
||||||
|
// sub 8
|
||||||
|
const svint8_t qx0s = svsub_n_s8_x(svptrue_b8(), qx0, 8);
|
||||||
|
const svint8_t qx1s = svsub_n_s8_x(svptrue_b8(), qx1, 8);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
|
||||||
|
const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
|
||||||
|
|
||||||
|
// dot product
|
||||||
|
sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
|
||||||
|
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
|
||||||
|
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
|
||||||
|
} break;
|
||||||
|
case 512:
|
||||||
|
{
|
||||||
|
// predicate for activating higher lanes for 32 int8 elements
|
||||||
|
const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
|
||||||
|
|
||||||
|
// predicate for activating higher lanes for 16 int8 elements
|
||||||
|
const svbool_t ph16 = svptrue_pat_b8(SV_VL16);
|
||||||
|
// predicate for activating lower lanes for 16 int8 elements from first 32 int8 activated lanes
|
||||||
|
const svbool_t pl16 = svnot_b_z(ph32, ph16);
|
||||||
|
|
||||||
|
for (; ib + 1 < nb; ib += 2) {
|
||||||
|
const block_q4_0 * restrict x0 = &x[ib + 0];
|
||||||
|
const block_q4_0 * restrict x1 = &x[ib + 1];
|
||||||
|
const block_q8_0 * restrict y0 = &y[ib + 0];
|
||||||
|
const block_q8_0 * restrict y1 = &y[ib + 1];
|
||||||
|
|
||||||
|
// load x
|
||||||
|
const svuint8_t qx0r = svld1rq_u8(ph32, x0->qs);
|
||||||
|
const svuint8_t qx1r = svld1rq_u8(ph32, x1->qs);
|
||||||
|
|
||||||
|
// 4-bit -> 8-bit
|
||||||
|
const svint8_t qx0 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx0r, 0x0F), 0x04));
|
||||||
|
const svint8_t qx1 = svreinterpret_s8_u8(svlsr_n_u8_m(pl16, svand_n_u8_m(ph16, qx1r, 0x0F), 0x04));
|
||||||
|
|
||||||
|
// sub 8
|
||||||
|
const svint8_t qx0s = svsub_n_s8_x(ph32, qx0, 8);
|
||||||
|
const svint8_t qx1s = svsub_n_s8_x(ph32, qx1, 8);
|
||||||
|
|
||||||
|
// load y
|
||||||
|
const svint8_t qy0 = svld1_s8(ph32, y0->qs);
|
||||||
|
const svint8_t qy1 = svld1_s8(ph32, y1->qs);
|
||||||
|
|
||||||
|
// dot product
|
||||||
|
sumv0 = svmla_n_f32_x(ph32, sumv0, svcvt_f32_s32_x(ph32,
|
||||||
|
svdot_s32(svdup_n_s32(0), qx0s, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
||||||
|
sumv1 = svmla_n_f32_x(ph32, sumv1, svcvt_f32_s32_x(ph32,
|
||||||
|
svdot_s32(svdup_n_s32(0), qx1s, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
||||||
|
}
|
||||||
|
|
||||||
|
sumf = svaddv_f32(ph32, svadd_f32_x(ph32, sumv0, sumv1));
|
||||||
|
} break;
|
||||||
|
default:
|
||||||
|
assert(false && "Unsupported vector length");
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(__ARM_NEON)
|
#elif defined(__ARM_NEON)
|
||||||
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
float32x4_t sumv0 = vdupq_n_f32(0.0f);
|
||||||
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
float32x4_t sumv1 = vdupq_n_f32(0.0f);
|
||||||
|
@@ -5494,29 +5593,124 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        svfloat32_t sumv0 = svdup_n_f32(0.0f);
-        svfloat32_t sumv1 = svdup_n_f32(0.0f);
-
-        for (; ib + 1 < nb; ib += 2) {
-            const block_q8_0 * restrict x0 = &x[ib + 0];
-            const block_q8_0 * restrict x1 = &x[ib + 1];
-            const block_q8_0 * restrict y0 = &y[ib + 0];
-            const block_q8_0 * restrict y1 = &y[ib + 1];
-
-            // load x
-            const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
-            const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
-
-            // load y
-            const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
-            const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
-
-            sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-            sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-        }
-
-        sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+    svfloat32_t sumv0 = svdup_n_f32(0.0f);
+    svfloat32_t sumv1 = svdup_n_f32(0.0f);
+
+    const int vector_length = ggml_sve_cnt_b*8;
+
+    //VLA Implemenation for SVE
+    switch (vector_length) {
+        case 128:
+            {
+                // predicate for activating lanes for 16 Int8 elements
+                const svbool_t ph16 = svptrue_pat_b8 (SV_VL16);
+                const svbool_t pl16 = svptrue_pat_b32(SV_VL4);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svint8_t qx0_0 = svld1_s8(ph16, x0->qs);
+                    const svint8_t qx0_1 = svld1_s8(ph16, x0->qs+16);
+                    const svint8_t qx1_0 = svld1_s8(ph16, x1->qs);
+                    const svint8_t qx1_1 = svld1_s8(ph16, x1->qs+16);
+
+                    // load y
+                    const svint8_t qy0_0 = svld1_s8(ph16, y0->qs);
+                    const svint8_t qy0_1 = svld1_s8(ph16, y0->qs+16);
+                    const svint8_t qy1_0 = svld1_s8(ph16, y1->qs);
+                    const svint8_t qy1_1 = svld1_s8(ph16, y1->qs+16);
+
+                    sumv0 = svmla_n_f32_x(pl16, sumv0, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx0_0, qy0_0),
+                                    svdot_s32(svdup_n_s32(0), qx0_1, qy0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(pl16, sumv1, svcvt_f32_s32_x(pl16, svadd_x(pl16,
+                                    svdot_s32(svdup_n_s32(0), qx1_0, qy1_0),
+                                    svdot_s32(svdup_n_s32(0), qx1_1, qy1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(pl16, svadd_f32_x(pl16, sumv0, sumv1));
+            } break;
+        case 256:
+            {
+                //printf("sve256");
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    // load x
+                    const svint8_t qx0 = svld1_s8(svptrue_b8(), x0->qs);
+                    const svint8_t qx1 = svld1_s8(svptrue_b8(), x1->qs);
+
+                    // load y
+                    const svint8_t qy0 = svld1_s8(svptrue_b8(), y0->qs);
+                    const svint8_t qy1 = svld1_s8(svptrue_b8(), y1->qs);
+
+                    sumv0 = svmla_n_f32_x(svptrue_b32(), sumv0, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx0, qy0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                    sumv1 = svmla_n_f32_x(svptrue_b32(), sumv1, svcvt_f32_s32_x(svptrue_b32(),
+                                svdot_s32(svdup_n_s32(0), qx1, qy1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), svadd_f32_x(svptrue_b32(), sumv0, sumv1));
+            } break;
+        case 512:
+            {
+                // predicate for activating high 256 bit
+                const svbool_t ph32 = svptrue_pat_b8(SV_VL32);
+                // predicate for activating low 256 bit
+                const svbool_t pl32 = svnot_b_z(svptrue_b8(), ph32);
+
+                // predicate for activating high lanes for 8 float32 elements
+                const svbool_t ph8 = svptrue_pat_b32(SV_VL8);
+                // predicate for activating low lanes for 8 float32 elements
+                const svbool_t pl8 = svnot_b_z(svptrue_b32(), ph8);
+
+                svfloat32_t sumv00 = svdup_n_f32(0.0f);
+
+                for (; ib + 1 < nb; ib += 2) {
+                    const block_q8_0 * restrict x0 = &x[ib + 0];
+                    const block_q8_0 * restrict x1 = &x[ib + 1];
+                    const block_q8_0 * restrict y0 = &y[ib + 0];
+                    const block_q8_0 * restrict y1 = &y[ib + 1];
+
+                    //load 32 int8_t in first half of vector and put another 32 int8_t in second vector lower bits
+                    // and add them to make one 64 element vector
+                    // load x
+                    const svint8_t qx_32 = svld1_s8(ph32, x0->qs);
+                    svint8_t qx_64 = svld1_s8(pl32, x0->qs + 2);
+
+                    qx_64 = svadd_s8_x(svptrue_b8(), qx_32, qx_64);
+
+                    // load y
+                    const svint8_t qy_32 = svld1_s8(ph32, y0->qs);
+                    svint8_t qy_64 = svld1_s8(pl32, y0->qs + 2);
+
+                    qy_64 = svadd_s8_x(svptrue_b8(), qy_32, qy_64);
+
+                    // scale creation
+                    const float32_t deq1 = GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d);
+                    const float32_t deq2 = GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d);
+
+                    // duplicate deq1 in first half of vector and deq2 in second half of vector
+                    const svfloat32_t temp = svdup_f32_m(svdup_f32_z(ph8, deq1), pl8, deq2);
+
+                    const svfloat32_t sumvt = svcvt_f32_s32_x(svptrue_b32(), svdot_s32(svdup_n_s32(0), qx_64, qy_64));
+
+                    sumv00 = svmla_f32_m(svptrue_b32(), sumv00, sumvt, temp);
+                }
+
+                sumf = svaddv_f32(svptrue_b32(), sumv00);
+                break;
+            }
+        default:
+            assert(false && "Unsupported vector length");
+            break;
     }
 #elif defined(__ARM_NEON)
     float32x4_t sumv0 = vdupq_n_f32(0.0f);
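
The two hunks above replace the fixed-width SVE path with a dispatch on the vector length reported by the hardware at run time (ggml_sve_cnt_b*8). As a minimal sketch of that idea only — not code from this patch, and with hypothetical kernel_128/256/512 stand-ins for the per-width paths — the dispatch can be expressed like this:

    // assumes an SVE-enabled AArch64 toolchain; svcntb() reports the vector length in bytes
    #include <arm_sve.h>
    #include <cassert>

    static void dispatch_on_sve_vl(void (*kernel_128)(void), void (*kernel_256)(void), void (*kernel_512)(void)) {
        const int vector_length = (int) svcntb() * 8;  // bits per SVE vector on this CPU
        switch (vector_length) {
            case 128: kernel_128(); break;
            case 256: kernel_256(); break;
            case 512: kernel_512(); break;
            default:  assert(false && "Unsupported vector length"); break;
        }
    }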
@@ -883,15 +883,17 @@ ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rp
     }
     result->buffer = reinterpret_cast<ggml_backend_buffer_t>(tensor->buffer);
     if (result->buffer && buffers.find(result->buffer) == buffers.end()) {
-        return nullptr;
+        result->buffer = nullptr;
     }
 
-    // require that the tensor data does not go beyond the buffer end
-    uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
-    uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
-    uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
-    GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
-    GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+    if (result->buffer) {
+        // require that the tensor data does not go beyond the buffer end
+        uint64_t tensor_size = (uint64_t) ggml_nbytes(result);
+        uint64_t buffer_start = (uint64_t) ggml_backend_buffer_get_base(result->buffer);
+        uint64_t buffer_size = (uint64_t) ggml_backend_buffer_get_size(result->buffer);
+        GGML_ASSERT(tensor->data + tensor_size >= tensor->data); // check for overflow
+        GGML_ASSERT(tensor->data >= buffer_start && tensor->data + tensor_size <= buffer_start + buffer_size);
+    }
 
     result->op = (ggml_op) tensor->op;
     for (uint32_t i = 0; i < GGML_MAX_OP_PARAMS / sizeof(int32_t); i++) {
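
The bounds check introduced above only runs when the tensor actually has a buffer. As a standalone sketch (a hypothetical helper, not part of the patch), the same predicate can be written as:

    // data must lie fully inside [base, base + size) and the addition must not wrap around
    #include <cstdint>

    static bool tensor_within_buffer(uint64_t data, uint64_t nbytes, uint64_t base, uint64_t size) {
        if (data + nbytes < data) {
            return false; // overflow
        }
        return data >= base && data + nbytes <= base + size;
    }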
@@ -1060,7 +1062,7 @@ bool rpc_server::graph_compute(const std::vector<uint8_t> & input, std::vector<u
     const rpc_tensor * tensors = (const rpc_tensor *)(input.data() + sizeof(n_nodes) + n_nodes*sizeof(uint64_t) + sizeof(n_tensors));
     GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors);
 
-    static size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
+    size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false);
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_size,
         /*.mem_buffer =*/ NULL,
@@ -1954,6 +1954,11 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
         SYCL_CHECK(
             CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
                 look_ahead_size, *qptr)));
+        if (!ptr) {
+            fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, look_ahead_size);
+            return nullptr;
+        }
+
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;

@@ -4350,6 +4355,10 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     void * dev_ptr;
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, *stream)));
+    if (!dev_ptr) {
+        fprintf(stderr, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+        return nullptr;
+    }
     ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream);
     return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
 }

@@ -4570,7 +4579,11 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
         */
         SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
                                         size, *stream)));
+        if (!buf) {
+            char err_buf[1024];
+            snprintf(err_buf, 1023, "%s: can't malloc %lu Bytes memory on device", __func__, size);
+            throw std::runtime_error(err_buf);
+        }
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
             /*
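
All three SYCL hunks above follow the same pattern: a null return from sycl::malloc_device is reported instead of being dereferenced later. A reduced, standalone sketch of that pattern (not the backend's actual helper) looks like this:

    #include <sycl/sycl.hpp>
    #include <cstdio>

    static void * checked_device_alloc(sycl::queue & q, size_t nbytes) {
        void * ptr = sycl::malloc_device(nbytes, q);   // USM device allocation
        if (!ptr) {
            fprintf(stderr, "%s: can't malloc %zu bytes on device\n", __func__, nbytes);
            return nullptr; // caller decides whether to retry, fall back, or abort
        }
        return ptr;
    }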
@@ -5124,13 +5137,17 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
         case GGML_OP_SCALE:
         case GGML_OP_SQR:
         case GGML_OP_CLAMP:
+            return true;
         case GGML_OP_CONT:
+            return op->src[0]->type != GGML_TYPE_BF16;
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
             return true;
         case GGML_OP_ROPE:
             return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
+            // TODO: add support for the new F32 operations
+            return op->src[0]->type == GGML_TYPE_F16;
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
@@ -787,6 +787,9 @@ static vk_submission ggml_vk_create_submission(vk_device& device, vk_queue& q, s
 
 static void ggml_vk_submit(vk_context& ctx, vk::Fence fence) {
     if (ctx->seqs.empty()) {
+        if (fence) {
+            ctx->q->queue.submit({}, fence);
+        }
         return;
     }
     VK_LOG_DEBUG("ggml_vk_submit(" << ctx << ", " << fence << ")");

@@ -4616,7 +4619,7 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
     }, dryrun);
 }
 
-static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);

@@ -4626,10 +4629,10 @@ static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }
 
-static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
+static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     const uint32_t src0_type_size = ggml_type_size(src0->type);
     const uint32_t dst_type_size = ggml_type_size(dst->type);

@@ -4639,7 +4642,7 @@ static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
         0,
         0.0f, 0.0f,
-    });
+    }, dryrun);
 }
 
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {

@@ -5658,11 +5661,15 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
     }
 }
 
-static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, bool last_node, bool dryrun){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_tensor* tensor, int tensor_idx, bool use_fence);
+
+// Returns true if node has enqueued work into the queue, false otherwise
+// If submit is true the current all operations queued so far are being submitted to Vulkan to overlap cmdlist creation and GPU execution.
+static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, int node_idx, ggml_tensor *node_begin, int node_idx_begin, bool dryrun, bool last_node, bool submit){
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra;
 
     if (ggml_is_empty(node) || extra == nullptr) {
-        return;
+        return false;
     }
 
     VK_LOG_DEBUG("ggml_vk_build_graph(" << node << ", " << ggml_op_name(node->op) << ")");

@@ -5679,7 +5686,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_PERMUTE:
     case GGML_OP_TRANSPOSE:
     case GGML_OP_NONE:
-        return;
+        return false;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:

@@ -5689,7 +5696,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         case GGML_UNARY_OP_TANH:
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_REPEAT:

@@ -5726,7 +5733,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     default:
         std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
         GGML_ABORT("fatal error");
-        return;
+        return false;
     }
 
     vk_context compute_ctx;

@@ -5783,11 +5790,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     case GGML_OP_SIN:
-        ggml_vk_sin(ctx, compute_ctx, src0, node);
+        ggml_vk_sin(ctx, compute_ctx, src0, node, dryrun);
 
         break;
     case GGML_OP_COS:
-        ggml_vk_cos(ctx, compute_ctx, src0, node);
+        ggml_vk_cos(ctx, compute_ctx, src0, node, dryrun);
 
         break;
     case GGML_OP_CLAMP:

@@ -5826,7 +5833,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             ggml_vk_unary(ctx, compute_ctx, src0, node, dryrun);
             break;
         default:
-            return;
+            return false;
         }
         break;
     case GGML_OP_DIAG_MASK_INF:

@@ -5870,11 +5877,11 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
 
         break;
     default:
-        return;
+        return false;
     }
 
     if (dryrun) {
-        return;
+        return false;
     }
 
     ctx->tensor_ctxs[node_idx] = compute_ctx;
@@ -5885,14 +5892,34 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     last_node = true;
 #endif
 
-    if (last_node) {
+    if (submit || last_node) {
         ggml_vk_ctx_end(compute_ctx);
-        compute_ctx->exit_tensor_idx = node_idx;
+
+        // TODO probably it'd be better to pass a exit_node flag to ggml_vk_compute_forward
+        if (last_node) {
+            compute_ctx->exit_tensor_idx = node_idx_begin;
+        }
+        else {
+            compute_ctx->exit_tensor_idx = -1;
+        }
+
         ctx->compute_ctx.reset();
+
+        bool ok = ggml_vk_compute_forward(ctx, node_begin, node_idx_begin, false);
+        if (!ok) {
+            if (node->op == GGML_OP_UNARY) {
+                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
+            }
+            else {
+                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
+            }
+        }
     }
+    return true;
 }
 
-static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx){
+static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor, int tensor_idx, bool use_fence = true){
     ggml_tensor_extra_gpu * extra = nullptr;
 
     switch (tensor->op) {
@@ -5960,40 +5987,38 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
 
     VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")");
 
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    ggml_vk_check_results_0(tensor);
-#endif
-
     vk_context subctx = ctx->tensor_ctxs[tensor_idx].lock();
 
-#ifdef GGML_VULKAN_PERF
-    std::chrono::steady_clock::time_point start;
-#endif // GGML_VULKAN_PERF
+    // always wait for the GPU work to be done for the last submit
+    if (tensor_idx == subctx->exit_tensor_idx) {
+        use_fence = true;
+    }
 
     // Only run if ctx hasn't been submitted yet
     if (!subctx->seqs.empty()) {
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_0(tensor);
+        use_fence = true;
+#endif
+
         // Do staging buffer copies
         for (auto& cpy : subctx->in_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
         }
 
-#ifdef GGML_VULKAN_PERF
-        start = std::chrono::steady_clock::now();
-#endif // GGML_VULKAN_PERF
+        ggml_vk_submit(subctx, use_fence ? ctx->fence : vk::Fence{});
 
-        ggml_vk_submit(subctx, ctx->fence);
+        if (use_fence) {
+            VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+
+            ctx->device->device.resetFences({ ctx->fence });
+        }
+#ifdef GGML_VULKAN_CHECK_RESULTS
+        ggml_vk_check_results_1(tensor);
+#endif
     }
 
     if (tensor_idx == subctx->exit_tensor_idx) {
-        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-
-#ifdef GGML_VULKAN_PERF
-        auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start);
-        ctx->device->perf_logger->log_timing(tensor, duration.count());
-#endif // GGML_VULKAN_PERF
-
-        ctx->device->device.resetFences({ ctx->fence });
-
         // Do staging buffer copies
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
@@ -6482,7 +6507,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, 0, true);
+        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, nullptr, 0, true, false, false);
     }
     ggml_vk_preallocate_buffers(ctx);
     ggml_pipeline_allocate_descriptor_sets(ctx->device);

@@ -6497,31 +6522,36 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen
     // Reserve tensor context space for all nodes
     ctx->tensor_ctxs.resize(cgraph->n_nodes);
 
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph(ctx, cgraph->nodes[i], i, i == last_node, false);
-    }
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (ggml_vk_is_empty(node)) {
-            continue;
-        }
-
-        bool ok = ggml_vk_compute_forward(ctx, node, i);
-        if (!ok) {
-            if (node->op == GGML_OP_UNARY) {
-                std::cerr << __func__ << ": error: op not supported UNARY " << node->name << " (" << ggml_unary_op_name(static_cast<ggml_unary_op>(node->op_params[0])) << ")" << std::endl;
-            } else {
-                std::cerr << __func__ << ": error: op not supported " << node->name << " (" << ggml_op_name(node->op) << ")" << std::endl;
-            }
-        }
-#ifdef GGML_VULKAN_CHECK_RESULTS
-        else {
-            ggml_vk_check_results_1(node);
-        }
-#endif
-        GGML_ASSERT(ok);
-    }
+    bool first_node_in_batch = true; // true if next node will be first node in a batch
+    int submit_node_idx = 0; // index to first node in a batch
+
+    // submit work every submit_count node to overlap CPU cmdbuffer generation with GPU execution
+    constexpr int submit_count = 100;
+    int submitted_nodes = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        if (first_node_in_batch) {
+            submit_node_idx = i;
+        }
+
+        bool submit = (submitted_nodes >= submit_count) || (i == last_node);
+
+        bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, submit);
+
+        if (enqueued) {
+            ++submitted_nodes;
+
+#ifndef GGML_VULKAN_CHECK_RESULTS
+            if (first_node_in_batch) {
+                first_node_in_batch = false;
+            }
+#endif
+        }
+
+        if (submit) {
+            first_node_in_batch = true;
+            submitted_nodes = 0;
+        }
+    }
 
 #ifdef GGML_VULKAN_PERF

@@ -6602,6 +6632,7 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                     return false;
                 }
             } break;
+        case GGML_OP_CONT:
         case GGML_OP_CPY:
         case GGML_OP_DUP:
             {

@@ -6642,7 +6673,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
         case GGML_OP_COS:
         case GGML_OP_CLAMP:
         case GGML_OP_PAD:
-        case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ARGSORT:
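
The graph-compute hunk above records nodes into a batch and flushes every submit_count enqueued nodes, so CPU-side command-buffer recording overlaps with GPU execution of earlier batches. As a rough, standalone sketch of that control flow only — with hypothetical record/submit callbacks standing in for ggml_vk_build_graph and ggml_vk_compute_forward — the pattern is:

    #include <functional>
    #include <vector>

    static void run_batched(const std::vector<std::function<bool()>> & record_node,
                            const std::function<void(bool /*wait*/)> & submit_batch,
                            int submit_count = 100) {
        int submitted_nodes = 0;
        const int last_node = (int) record_node.size() - 1;
        for (int i = 0; i <= last_node; ++i) {
            const bool enqueued = record_node[i]();        // may be a no-op for empty nodes
            if (enqueued) {
                ++submitted_nodes;
            }
            const bool submit = (submitted_nodes >= submit_count) || (i == last_node);
            if (submit) {
                submit_batch(i == last_node);              // only the final batch must be waited on
                submitted_nodes = 0;
            }
        }
    }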
@@ -3847,7 +3847,7 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
 
     if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
         GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
-                __func__, cur_end + size_needed, ctx->mem_size);
+                __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
         assert(false);
         return NULL;
     }

@@ -5267,6 +5267,7 @@ struct ggml_tensor * ggml_concat(
     bool is_node = false;
 
     if (a->grad || b->grad) {
+        GGML_ABORT("fatal error"); // TODO: implement
         is_node = true;
     }
 

@@ -5388,6 +5389,7 @@ struct ggml_tensor * ggml_leaky_relu(
     bool is_node = false;
 
     if (!inplace && (a->grad)) {
+        GGML_ABORT("fatal error"); // TODO: not implemented
        is_node = true;
     }
 

@@ -5826,6 +5828,7 @@ static struct ggml_tensor * ggml_set_impl(
     // make a view of the destination
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
 
+    GGML_ASSERT(offset < (size_t)(1 << 30));
     int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 };
     ggml_set_op_params(result, params, sizeof(params));
 

@@ -6783,14 +6786,12 @@ struct ggml_tensor * ggml_rope_back(
     GGML_ASSERT(ggml_is_vector(b));
     GGML_ASSERT(b->type == GGML_TYPE_I32);
     GGML_ASSERT(a->ne[2] == b->ne[0]);
-    GGML_ASSERT(c == NULL && "freq factors not implemented yet");
-
-    GGML_ASSERT((mode & 4) == 0 && "ggml_rope_back() for ChatGLM not implemented yet");
 
     bool is_node = false;
 
     if (a->grad) {
-        is_node = false; // TODO: implement backward
+        GGML_ASSERT(false && "backwards pass not implemented");
+        is_node = false;
     }
 
     struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

@@ -6808,6 +6809,7 @@ struct ggml_tensor * ggml_rope_back(
     result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
+    result->src[2] = c;
 
     return result;
 }

@@ -7361,6 +7363,11 @@ struct ggml_tensor * ggml_argsort(
         enum ggml_sort_order  order) {
     bool is_node = false;
 
+    if (a->grad) {
+        GGML_ABORT("fatal error"); // TODO: not implemented
+        is_node = true;
+    }
+
     struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_I32, GGML_MAX_DIMS, a->ne);
 
     ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -8322,8 +8329,7 @@ static void ggml_compute_forward_dup_same_cont(
     GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
     GGML_ASSERT(src0->type == dst->type);
 
-    const size_t nb00 = src0->nb[0];
-    const size_t nb0 = dst->nb[0];
+    const size_t nb0 = ggml_type_size(src0->type);
 
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads

@@ -8337,8 +8343,8 @@ static void ggml_compute_forward_dup_same_cont(
     if (ie0 < ie1) {
         memcpy(
             ((char *)  dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb00),
-            (ie1 - ie0) * ggml_type_size(src0->type));
+            ((char *) src0->data + ie0*nb0),
+            (ie1 - ie0) * nb0);
     }
 }
 

@@ -8355,11 +8361,6 @@ static void ggml_compute_forward_dup_f16(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread

@@ -8624,11 +8625,6 @@ static void ggml_compute_forward_dup_bf16(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread

@@ -8980,11 +8976,6 @@ static void ggml_compute_forward_dup_f32(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
-        ggml_compute_forward_dup_same_cont(params, dst);
-        return;
-    }
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread

@@ -9294,13 +9285,13 @@ static void ggml_compute_forward_dup_bytes(
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
     GGML_ASSERT(src0->type == dst->type);
 
+    GGML_TENSOR_UNARY_OP_LOCALS;
+
     if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
         ggml_compute_forward_dup_same_cont(params, dst);
         return;
     }
 
-    GGML_TENSOR_UNARY_OP_LOCALS;
-
     const size_t type_size = ggml_type_size(src0->type);
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads

@@ -10969,9 +10960,6 @@ static void ggml_compute_forward_sum_f32(
         return;
     }
 
-    assert(ggml_is_scalar(dst));
-
-
     assert(ggml_is_scalar(dst));
     assert(src0->nb[0] == sizeof(float));
@ -18372,14 +18360,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||||
if (src0->grad || src1->grad) {
|
if (src0->grad || src1->grad) {
|
||||||
GGML_ASSERT(src0->type == tensor->type);
|
GGML_ASSERT(src0->type == tensor->type);
|
||||||
GGML_ASSERT(tensor->grad->type == tensor->type);
|
GGML_ASSERT(tensor->grad->type == tensor->type);
|
||||||
GGML_ASSERT(tensor->grad->type == src1->grad->type);
|
GGML_ASSERT(!src1->grad || src1->grad->type == tensor->grad->type);
|
||||||
|
|
||||||
tensor_grad_view = ggml_view_4d(ctx,
|
tensor_grad_view = ggml_view_4d(ctx,
|
||||||
tensor->grad,
|
tensor->grad, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
|
||||||
src1->grad->ne[0],
|
|
||||||
src1->grad->ne[1],
|
|
||||||
src1->grad->ne[2],
|
|
||||||
src1->grad->ne[3],
|
|
||||||
nb1, nb2, nb3, offset);
|
nb1, nb2, nb3, offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18448,9 +18432,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
||||||
|
|
||||||
memcpy(&offset, tensor->op_params, sizeof(offset));
|
memcpy(&offset, tensor->op_params, sizeof(offset));
|
||||||
|
|
||||||
size_t nb1 = tensor->nb[1];
|
size_t nb1 = tensor->nb[1];
|
||||||
size_t nb2 = tensor->nb[2];
|
size_t nb2 = tensor->nb[2];
|
||||||
size_t nb3 = tensor->nb[3];
|
size_t nb3 = tensor->nb[3];
|
||||||
|
|
||||||
if (src0->type != src0->grad->type) {
|
if (src0->type != src0->grad->type) {
|
||||||
// gradient is typically F32, but src0 could be other type
|
// gradient is typically F32, but src0 could be other type
|
||||||
|
@ -19146,7 +19130,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
|
for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
|
||||||
if (src->visited_hash_set.keys[i]) {
|
// copy all hashset keys (tensors) that are in use
|
||||||
|
if (ggml_bitset_get(src->visited_hash_set.used, i)) {
|
||||||
ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
|
ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -1127,15 +1127,20 @@ extern "C" {
                              int32_t   n_logit_bias,
               const llama_logit_bias * logit_bias);
 
-    // Shorthand for:
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+
+    /// @details Sample and accept a token from the idx-th output of the last evaluation
     //
+    // Shorthand for:
     //    const auto * logits = llama_get_logits_ith(ctx, idx);
     //    llama_token_data_array cur_p = { ... init from logits ... };
     //    llama_sampler_apply(smpl, &cur_p);
-    //    return cur_p.data[cur_p.selected].id;
-    //
-    // At this point, this is mostly a convenience function.
+    //    auto token = cur_p.data[cur_p.selected].id;
+    //    llama_sampler_accept(smpl, token);
+    //    return token;
     //
+    // Returns the sampled token
     LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
 
     // TODO: extend in the future
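
The updated comment above spells out what llama_sampler_sample() is shorthand for. Written out as a sketch under the same assumptions (includes llama.h and <vector>; n_vocab is assumed to come from the model, e.g. via llama_n_vocab()), the expanded form looks roughly like this:

    static llama_token sample_ith(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx, int32_t n_vocab) {
        const float * logits = llama_get_logits_ith(ctx, idx);

        // build the candidate array from the logits of the idx-th output
        std::vector<llama_token_data> cur;
        cur.reserve(n_vocab);
        for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
            cur.push_back(llama_token_data{token_id, logits[token_id], 0.0f});
        }
        llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };

        llama_sampler_apply(smpl, &cur_p);
        const llama_token token = cur_p.data[cur_p.selected].id;
        llama_sampler_accept(smpl, token);
        return token;
    }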
@@ -5,7 +5,7 @@
 # Usage:
 #
 #   $ cd /path/to/llama.cpp
-#   $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2...
+#   $ ./scripts/sync-ggml-am.sh -skip hash0,hash1,hash2... -C 3
 #
 
 set -e

@@ -25,9 +25,23 @@ lc=$(cat $SRC_LLAMA/scripts/sync-ggml.last)
 echo "Syncing ggml changes since commit $lc"
 
 to_skip=""
-if [ "$1" == "-skip" ]; then
-    to_skip=$2
-fi
+
+# context for git patches in number of lines
+ctx="8"
+
+while [ "$1" != "" ]; do
+    case $1 in
+        -skip )
+            shift
+            to_skip=$1
+            ;;
+        -C )
+            shift
+            ctx=$1
+            ;;
+    esac
+    shift
+done
 
 cd $SRC_GGML
 

@@ -52,7 +66,7 @@ while read c; do
         fi
     fi
 
-    git format-patch -k $c~1..$c --stdout -- \
+    git format-patch -U${ctx} -k $c~1..$c --stdout -- \
         CMakeLists.txt \
         src/CMakeLists.txt \
         cmake/FindSIMD.cmake \

@@ -191,7 +205,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
         > ggml-src.patch.tmp
     mv ggml-src.patch.tmp ggml-src.patch
 
-    git am ggml-src.patch
+    git am -C${ctx} ggml-src.patch
 
     rm -v $SRC_LLAMA/ggml-src.patch
 fi
@@ -1 +1 @@
-28b7633d733bbeef0026570fbc61c79c5e9aa5ae
+10e83a412717c20d57ba19f025248e18e43addf3
@@ -101,6 +101,10 @@ struct ring_buffer {
     }
 
     void push_back(const T & value) {
+        if (capacity == 0) {
+            throw std::runtime_error("ring buffer: capacity is zero");
+        }
+
         if (sz == capacity) {
             // advance the start when buffer is full
            first = (first + 1) % capacity;
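
For context around the guard added above, a minimal fixed-capacity ring buffer push_back with that zero-capacity check in place looks like this (a simplified sketch, not the full llama.cpp ring_buffer):

    #include <stdexcept>
    #include <vector>

    template <typename T>
    struct mini_ring {
        size_t capacity = 0, sz = 0, first = 0, pos = 0;
        std::vector<T> data;

        explicit mini_ring(size_t cap) : capacity(cap), data(cap) {}

        void push_back(const T & value) {
            if (capacity == 0) {
                throw std::runtime_error("ring buffer: capacity is zero");
            }
            if (sz == capacity) {
                // advance the start when the buffer is full (oldest element is overwritten)
                first = (first + 1) % capacity;
            } else {
                sz++;
            }
            data[pos] = value;
            pos = (pos + 1) % capacity;
        }
    };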
File diff suppressed because it is too large
@@ -23,16 +23,6 @@ struct llama_sampler_chain {
     mutable int32_t n_sample;
 };
 
-using llama_token_cnt = std::unordered_map<llama_token, int>;
-
-// TODO: tmp exposed until test-sampling is fixed
-void llama_sampler_penalties_impl(
-       llama_token_data_array * cur_p,
-        const llama_token_cnt & token_count,
-                        float   penalty_repeat,
-                        float   penalty_freq,
-                        float   penalty_present);
-
 struct llama_sampler * llama_sampler_init_grammar_impl(
         const struct llama_vocab & vocab,
                       const char * grammar_str,
@@ -6399,6 +6399,11 @@ static void llm_load_vocab(
                         )
                    ) {
                     vocab.special_eot_id = t.second;
+                    if ((vocab.id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                        LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                            __func__, t.first.c_str());
+                        vocab.id_to_token[t.second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                    }
                     break;
                 }
             }

@@ -6412,6 +6417,11 @@ static void llm_load_vocab(
             const auto & t = vocab.token_to_id.find("<|eom_id|>");
             if (t != vocab.token_to_id.end()) {
                 vocab.special_eom_id = t->second;
+                if ((vocab.id_to_token[t->second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
+                    LLAMA_LOG_WARN("%s: control-looking token: '%s' was not control-type; this is probably a bug in the model. its type will be overridden\n",
+                        __func__, t->first.c_str());
+                    vocab.id_to_token[t->second].attr = LLAMA_TOKEN_ATTR_CONTROL;
+                }
             }
         }
     }

@@ -9248,7 +9258,7 @@ static struct ggml_tensor * llm_build_copy_mask_state(
     // FIXME: zero-out NANs?
     states = ggml_mul(ctx, states, state_mask);
 
-    // copy states which won't be changed further (between n_seqs and n_rs)
+    // copy states which won't be changed further (between n_seqs and n_kv)
     ggml_build_forward_expand(graph,
         ggml_cpy(ctx,
             ggml_view_1d(ctx, states, n_state*(n_kv - n_seqs), n_seqs*n_state*ggml_element_size(states)),

@@ -16067,7 +16077,7 @@ static int llama_decode_internal(
     }
 
     for (uint32_t i = 0; i < n_tokens_all; ++i) {
-        if (batch_all.token[i] < 0) {
+        if (batch_all.token[i] < 0 || (uint32_t)batch_all.token[i] >= lctx.model.vocab.n_vocab) {
             LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch_all.token[i]);
             return -1;
         }
     }

@@ -16366,7 +16376,7 @@ static int llama_encode_internal(
     }
 
     for (uint32_t i = 0; i < n_tokens; ++i) {
-        if (batch.token[i] < 0) {
+        if (batch.token[i] < 0 || (uint32_t)batch.token[i] >= lctx.model.vocab.n_vocab) {
             LLAMA_LOG_ERROR("%s: invalid token[%d] = %d", __func__, i, batch.token[i]);
             return -1;
         }
     }

@@ -17520,6 +17530,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= name.find("time_mix_first.weight") == std::string::npos;
         quantize &= name.find("time_mix_w1.weight") == std::string::npos;
         quantize &= name.find("time_mix_w2.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w1.weight") == std::string::npos;
+        quantize &= name.find("time_mix_decay_w2.weight") == std::string::npos;
 
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
@@ -1,19 +1,43 @@
+#include "arg.h"
+#include "common.h"
+
 #include <string>
 #include <vector>
 #include <sstream>
+#include <unordered_set>
 
 #undef NDEBUG
 #include <cassert>
 
-#include "common.h"
-
 int main(void) {
     gpt_params params;
 
     printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n");
     for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
         try {
-            gpt_params_parser_init(params, (enum llama_example)ex);
+            auto ctx_arg = gpt_params_parser_init(params, (enum llama_example)ex);
+            std::unordered_set<std::string> seen_args;
+            std::unordered_set<std::string> seen_env_vars;
+            for (const auto & opt : ctx_arg.options) {
+                // check for args duplications
+                for (const auto & arg : opt.args) {
+                    if (seen_args.find(arg) == seen_args.end()) {
+                        seen_args.insert(arg);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg);
+                        exit(1);
+                    }
+                }
+                // check for env var duplications
+                if (opt.env) {
+                    if (seen_env_vars.find(opt.env) == seen_env_vars.end()) {
+                        seen_env_vars.insert(opt.env);
+                    } else {
+                        fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env);
+                        exit(1);
+                    }
+                }
+            }
         } catch (std::exception & e) {
             printf("%s\n", e.what());
             assert(false);
|
@@ -29,40 +53,51 @@ int main(void) {
     };
 
     std::vector<std::string> argv;
-    auto options = gpt_params_parser_init(params, LLAMA_EXAMPLE_COMMON);
 
     printf("test-arg-parser: test invalid usage\n\n");
 
+    // missing value
     argv = {"binary_name", "-m"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
+    // wrong value (int)
     argv = {"binary_name", "-ngl", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
+    // wrong value (enum)
     argv = {"binary_name", "-sm", "hello"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
+
+    // non-existence arg in specific example (--draft cannot be used outside llama-speculative)
+    argv = {"binary_name", "--draft", "123"};
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SERVER));
 
 
     printf("test-arg-parser: test valid usage\n\n");
 
     argv = {"binary_name", "-m", "model_file.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "model_file.gguf");
 
     argv = {"binary_name", "-t", "1234"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.cpuparams.n_threads == 1234);
 
     argv = {"binary_name", "--verbose"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.verbosity == 1);
 
     argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "abc.gguf");
     assert(params.n_predict == 6789);
     assert(params.n_batch == 9090);
 
+    // --draft cannot be used outside llama-speculative
+    argv = {"binary_name", "--draft", "123"};
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_SPECULATIVE));
+    assert(params.n_draft == 123);
+
 // skip this part on windows, because setenv is not supported
 #ifdef _WIN32
     printf("test-arg-parser: skip on windows build\n");

@@ -71,12 +106,12 @@ int main(void) {
 
     setenv("LLAMA_ARG_THREADS", "blah", true);
     argv = {"binary_name"};
-    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(false == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
 
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "blah.gguf");
     assert(params.cpuparams.n_threads == 1010);
 

@@ -86,7 +121,7 @@ int main(void) {
     setenv("LLAMA_ARG_MODEL", "blah.gguf", true);
     setenv("LLAMA_ARG_THREADS", "1010", true);
     argv = {"binary_name", "-m", "overwritten.gguf"};
-    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, options));
+    assert(true == gpt_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON));
     assert(params.model == "overwritten.gguf");
     assert(params.cpuparams.n_threads == 1010);
 #endif // _WIN32
File diff suppressed because it is too large
@@ -148,15 +148,17 @@ static void test_penalties(
         cur.emplace_back(llama_token_data{token_id, logit, 0.0f});
     }
 
-    llama_token_cnt token_count;
+    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
+
+    auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false);
 
     for (size_t i = 0; i < last_tokens.size(); i++) {
-        token_count[last_tokens[i]]++;
+        llama_sampler_accept(sampler, last_tokens[i]);
     }
 
-    llama_token_data_array cur_p = { cur.data(), cur.size(), -1, false };
     APPLY(llama_sampler_init_softmax(), &cur_p);
     DUMP(&cur_p);
-    llama_sampler_penalties_impl(&cur_p, token_count, repeat_penalty, alpha_frequency, alpha_presence); // TODO: avoid
+    APPLY(sampler, &cur_p);
     APPLY(llama_sampler_init_softmax(), &cur_p);
     DUMP(&cur_p);
 

@@ -243,7 +245,7 @@ static void test_sampler_queue(const size_t n_vocab, const std::string & sampler
         }
     }
 
-    printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
+    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
            samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
 }