common : more explicit includes

This commit is contained in:
Georgi Gerganov 2024-09-09 18:22:25 +03:00
parent 3e03807043
commit 6412a598a1
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
31 changed files with 169 additions and 152 deletions

View file

@ -1,5 +1,7 @@
#include "arg.h" #include "arg.h"
#include "sampling.h"
#include <algorithm> #include <algorithm>
#include <string> #include <string>
#include <vector> #include <vector>
@ -341,10 +343,6 @@ bool gpt_params_parse(int argc, char ** argv, llama_arg_context & ctx_arg) {
return true; return true;
} }
// Convenience overload: sets up the argument context for example `ex` without a
// usage-printing callback by delegating to the three-argument overload.
llama_arg_context gpt_params_parser_init(gpt_params & params, llama_example ex) {
    // a null callback means "no example-specific usage printer"
    void (*no_print_usage)(int, char **) = nullptr;
    return gpt_params_parser_init(params, ex, no_print_usage);
}
llama_arg_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) { llama_arg_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)) {
llama_arg_context ctx_arg(params); llama_arg_context ctx_arg(params);
ctx_arg.print_usage = print_usage; ctx_arg.print_usage = print_usage;

View file

@ -2,37 +2,14 @@
#include "common.h" #include "common.h"
#include <set>
#include <string> #include <string>
#include <vector> #include <vector>
#include <set>
// //
// CLI argument parsing // CLI argument parsing
// //
struct gpt_params;
// Identifies the example program that CLI options are set up for.
// A value of this type is passed as `ex` to gpt_params_parser_init(), and
// llama_arg::examples holds a set of these values (LLAMA_EXAMPLE_COMMON by default).
// Enumerators are implicitly numbered, starting at 0, in declaration order.
enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_INFILL,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
LLAMA_EXAMPLE_PASSKEY,
LLAMA_EXAMPLE_IMATRIX,
LLAMA_EXAMPLE_BENCH,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
// NOTE(review): looks like a "number of examples" sentinel rather than a real example;
// if so it must stay last - confirm
LLAMA_EXAMPLE_COUNT,
};
struct llama_arg { struct llama_arg {
std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON}; std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
std::vector<const char *> args; std::vector<const char *> args;
@ -92,10 +69,8 @@ struct llama_arg_context {
llama_arg_context(gpt_params & params) : params(params) {} llama_arg_context(gpt_params & params) : params(params) {}
}; };
// initialize list of options (arguments) that can be used by the current example
llama_arg_context gpt_params_parser_init(gpt_params & params, llama_example ex);
// optionally, we can provide "print_usage" to print example usage // optionally, we can provide "print_usage" to print example usage
llama_arg_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **)); llama_arg_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// parse input arguments from CLI // parse input arguments from CLI
// if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message) // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)

View file

@ -4,21 +4,11 @@
#include "llama.h" #include "llama.h"
#include "sampling.h"
#include "arg.h"
#define LOG_NO_FILE_LINE_FUNCTION #define LOG_NO_FILE_LINE_FUNCTION
#include "log.h" #include "log.h"
#include <cmath>
#include <string> #include <string>
#include <vector> #include <vector>
#include <random>
#include <thread>
#include <set>
#include <unordered_map>
#include <tuple>
#include <functional>
#ifdef _WIN32 #ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\' #define DIRECTORY_SEPARATOR '\\'
@ -57,19 +47,6 @@ struct llama_control_vector_load_info;
// CPU utils // CPU utils
// //
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
//
// Common params
//
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
DIMRE_METHOD_PCA,  // NOTE(review): presumably principal component analysis - confirm
DIMRE_METHOD_MEAN, // NOTE(review): presumably a plain mean/average reduction - confirm
};
struct cpu_params { struct cpu_params {
int n_threads = -1; int n_threads = -1;
bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
@ -79,6 +56,92 @@ struct cpu_params {
uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling)
}; };
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
//
// Common params
//
// Identifies the example program that CLI options are set up for.
// A value of this type is passed as `ex` to gpt_params_parser_init(), and
// llama_arg::examples holds a set of these values (LLAMA_EXAMPLE_COMMON by default).
// Enumerators are implicitly numbered, starting at 0, in declaration order.
enum llama_example {
LLAMA_EXAMPLE_COMMON,
LLAMA_EXAMPLE_SPECULATIVE,
LLAMA_EXAMPLE_MAIN,
LLAMA_EXAMPLE_INFILL,
LLAMA_EXAMPLE_EMBEDDING,
LLAMA_EXAMPLE_PERPLEXITY,
LLAMA_EXAMPLE_RETRIEVAL,
LLAMA_EXAMPLE_PASSKEY,
LLAMA_EXAMPLE_IMATRIX,
LLAMA_EXAMPLE_BENCH,
LLAMA_EXAMPLE_SERVER,
LLAMA_EXAMPLE_CVECTOR_GENERATOR,
LLAMA_EXAMPLE_EXPORT_LORA,
LLAMA_EXAMPLE_LLAVA,
LLAMA_EXAMPLE_LOOKUP,
LLAMA_EXAMPLE_PARALLEL,
// NOTE(review): looks like a "number of examples" sentinel rather than a real example;
// if so it must stay last - confirm
LLAMA_EXAMPLE_COUNT,
};
// Identifies a kind of sampler. gpt_sampler_params::samplers holds a sequence of these values.
// Every enumerator has an explicitly assigned numeric value (0..6).
// NOTE(review): the names appear to line up with the top_k / top_p / min_p / tfs_z / typ_p / temp
// fields of gpt_sampler_params - confirm against the sampler implementation.
enum gpt_sampler_type {
GPT_SAMPLER_TYPE_NONE = 0,
GPT_SAMPLER_TYPE_TOP_K = 1,
GPT_SAMPLER_TYPE_TOP_P = 2,
GPT_SAMPLER_TYPE_MIN_P = 3,
GPT_SAMPLER_TYPE_TFS_Z = 4,
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
};
// dimensionality reduction methods, used by cvector-generator
enum dimre_method {
DIMRE_METHOD_PCA,  // NOTE(review): presumably principal component analysis - confirm
DIMRE_METHOD_MEAN, // NOTE(review): presumably a plain mean/average reduction - confirm
};
// sampler parameters
// Plain settings struct for token sampling; most members carry an in-class default.
// gpt_params embeds one instance of this struct as its `sparams` member.
struct gpt_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
// NOTE(review): presumably keeps the end-of-sequence token from being sampled - confirm
bool ignore_eos = false;
// default list of samplers
// NOTE(review): presumably they are applied in the listed order - confirm
std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
GPT_SAMPLER_TYPE_TFS_Z,
GPT_SAMPLER_TYPE_TYPICAL_P,
GPT_SAMPLER_TYPE_TOP_P,
GPT_SAMPLER_TYPE_MIN_P,
GPT_SAMPLER_TYPE_TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
// (declaration only; the definition is not visible here)
std::string print() const;
};
struct gpt_params { struct gpt_params {
int32_t n_predict = -1; // new tokens to predict int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size int32_t n_ctx = 0; // context size
@ -123,23 +186,23 @@ struct gpt_params {
struct gpt_sampler_params sparams; struct gpt_sampler_params sparams;
std::string model = ""; // model path std::string model = ""; // model path // NOLINT
std::string model_draft = ""; // draft model for speculative decoding std::string model_draft = ""; // draft model for speculative decoding // NOLINT
std::string model_alias = "unknown"; // model alias std::string model_alias = "unknown"; // model alias // NOLINT
std::string model_url = ""; // model url to download std::string model_url = ""; // model url to download // NOLINT
std::string hf_token = ""; // HF token std::string hf_token = ""; // HF token // NOLINT
std::string hf_repo = ""; // HF repo std::string hf_repo = ""; // HF repo // NOLINT
std::string hf_file = ""; // HF file std::string hf_file = ""; // HF file // NOLINT
std::string prompt = ""; std::string prompt = ""; // NOLINT
std::string prompt_file = ""; // store the external prompt file name std::string prompt_file = ""; // store the external prompt file name // NOLINT
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state // NOLINT
std::string input_prefix = ""; // string to prefix user inputs with std::string input_prefix = ""; // string to prefix user inputs with // NOLINT
std::string input_suffix = ""; // string to suffix user inputs with std::string input_suffix = ""; // string to suffix user inputs with // NOLINT
std::string logdir = ""; // directory in which to save YAML log files std::string logdir = ""; // directory in which to save YAML log files // NOLINT
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits std::string logits_file = ""; // file for saving *all* logits // NOLINT
std::string rpc_servers = ""; // comma separated list of RPC servers std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts) std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
@ -200,7 +263,7 @@ struct gpt_params {
std::string cache_type_v = "f16"; // KV cache data type for the V std::string cache_type_v = "f16"; // KV cache data type for the V
// multimodal models (see examples/llava) // multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s) std::vector<std::string> image; // path to image file(s)
// embedding // embedding
@ -216,15 +279,15 @@ struct gpt_params {
int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = ""; std::string public_path = ""; // NOLINT
std::string chat_template = ""; std::string chat_template = ""; // NOLINT
std::string system_prompt = ""; std::string system_prompt = ""; // NOLINT
bool enable_chat_template = true; bool enable_chat_template = true;
std::vector<std::string> api_keys; std::vector<std::string> api_keys;
std::string ssl_file_key = ""; std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; std::string ssl_file_cert = ""; // NOLINT
bool endpoint_slots = true; bool endpoint_slots = true;
bool endpoint_metrics = false; bool endpoint_metrics = false;

View file

@ -2,61 +2,11 @@
#include "llama.h" #include "llama.h"
#include "common.h"
#include <string> #include <string>
#include <vector> #include <vector>
// Identifies a kind of sampler. gpt_sampler_params::samplers holds a sequence of these values.
// Every enumerator has an explicitly assigned numeric value (0..6).
// NOTE(review): the names appear to line up with the top_k / top_p / min_p / tfs_z / typ_p / temp
// fields of gpt_sampler_params - confirm against the sampler implementation.
enum gpt_sampler_type {
GPT_SAMPLER_TYPE_NONE = 0,
GPT_SAMPLER_TYPE_TOP_K = 1,
GPT_SAMPLER_TYPE_TOP_P = 2,
GPT_SAMPLER_TYPE_MIN_P = 3,
GPT_SAMPLER_TYPE_TFS_Z = 4,
GPT_SAMPLER_TYPE_TYPICAL_P = 5,
GPT_SAMPLER_TYPE_TEMPERATURE = 6,
};
// sampling parameters
// Plain settings struct for token sampling; most members carry an in-class default.
// gpt_params embeds one instance of this struct as its `sparams` member.
struct gpt_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
int32_t n_prev = 64; // number of previous tokens to remember
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float min_p = 0.05f; // 0.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat = 1.00f; // 1.0 = disabled
float penalty_freq = 0.00f; // 0.0 = disabled
float penalty_present = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
bool penalize_nl = false; // consider newlines as a repeatable token
// NOTE(review): presumably keeps the end-of-sequence token from being sampled - confirm
bool ignore_eos = false;
// default list of samplers
// NOTE(review): presumably they are applied in the listed order - confirm
std::vector<enum gpt_sampler_type> samplers = {
GPT_SAMPLER_TYPE_TOP_K,
GPT_SAMPLER_TYPE_TFS_Z,
GPT_SAMPLER_TYPE_TYPICAL_P,
GPT_SAMPLER_TYPE_TOP_P,
GPT_SAMPLER_TYPE_MIN_P,
GPT_SAMPLER_TYPE_TEMPERATURE
};
std::string grammar; // optional BNF-like grammar to constrain sampling
std::vector<llama_logit_bias> logit_bias; // logit biases to apply
// print the parameters into a string
// (declaration only; the definition is not visible here)
std::string print() const;
};
// gpt_sampler extends llama_sampler with additional functionality: // gpt_sampler extends llama_sampler with additional functionality:
// //
// - grammar support // - grammar support

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"

View file

@ -12,12 +12,9 @@
#include <cstdio> #include <cstdio>
#include <ctime> #include <ctime>
#include <random>
#include <string> #include <string>
#include <tuple>
#include <vector> #include <vector>
#include <algorithm>
#include <iostream>
#include <fstream>
#define DEBUG_POS 5 #define DEBUG_POS 5

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "ggml.h" #include "ggml.h"

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "ggml.h" #include "ggml.h"
#include "ggml-alloc.h" #include "ggml-alloc.h"

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include <fstream> #include <fstream>

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,6 +1,7 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "console.h" #include "console.h"
#include "sampling.h"
#include "llama.h" #include "llama.h"
#include <cassert> #include <cassert>

View file

@ -1,11 +1,12 @@
#include "ggml.h" #include "arg.h"
#include "base64.hpp"
#include "log.h" #include "log.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "clip.h" #include "clip.h"
#include "llava.h" #include "llava.h"
#include "llama.h" #include "llama.h"
#include "ggml.h"
#include "base64.hpp"
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>

View file

@ -1,9 +1,11 @@
#include "ggml.h" #include "arg.h"
#include "log.h" #include "log.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "clip.h" #include "clip.h"
#include "llava.h" #include "llava.h"
#include "llama.h" #include "llama.h"
#include "ggml.h"
#include <cstdio> #include <cstdio>
#include <cstdlib> #include <cstdlib>

View file

@ -1,4 +1,6 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "llama.h" #include "llama.h"
#include <cstdio> #include <cstdio>

View file

@ -1,7 +1,8 @@
#include "ggml.h" #include "arg.h"
#include "llama.h"
#include "common.h" #include "common.h"
#include "ngram-cache.h" #include "ngram-cache.h"
#include "ggml.h"
#include "llama.h"
#include <cstdint> #include <cstdint>
#include <fstream> #include <fstream>
@ -40,4 +41,6 @@ int main(int argc, char ** argv){
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());
llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
return 0;
} }

View file

@ -1,8 +1,9 @@
#include "ggml.h" #include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h"
#include "log.h" #include "log.h"
#include "ngram-cache.h" #include "ngram-cache.h"
#include "llama.h"
#include "ggml.h"
#include <cmath> #include <cmath>
#include <cstdint> #include <cstdint>

View file

@ -1,7 +1,9 @@
#include "arg.h"
#include "ggml.h" #include "ggml.h"
#include "llama.h"
#include "common.h" #include "common.h"
#include "ngram-cache.h" #include "ngram-cache.h"
#include "sampling.h"
#include "llama.h"
#include <cstdint> #include <cstdint>
#include <cstdio> #include <cstdio>

View file

@ -1,6 +1,7 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "console.h" #include "console.h"
#include "sampling.h"
#include "llama.h" #include "llama.h"
#include <cassert> #include <cassert>

View file

@ -1,7 +1,9 @@
// A basic application simulating a server with multiple clients. // A basic application simulating a server with multiple clients.
// The clients submit requests to the server and they are processed in parallel. // The clients submit requests to the server and they are processed in parallel.
#include "arg.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "llama.h" #include "llama.h"
#include <cmath> #include <cmath>

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,18 +1,19 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include <array>
#include <atomic>
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <ctime> #include <ctime>
#include <fstream>
#include <mutex>
#include <random>
#include <sstream> #include <sstream>
#include <thread> #include <thread>
#include <mutex>
#include <atomic>
#include <vector> #include <vector>
#include <array>
#include <fstream>
#include <sstream>
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,6 +1,8 @@
#include "utils.hpp" #include "utils.hpp"
#include "arg.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "json-schema-to-grammar.h" #include "json-schema-to-grammar.h"
#include "llama.h" #include "llama.h"

View file

@ -1,3 +1,4 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"

View file

@ -1,11 +1,13 @@
#include "arg.h"
#include "common.h" #include "common.h"
#include "sampling.h"
#include "llama.h" #include "llama.h"
#include <cmath>
#include <cstdio> #include <cstdio>
#include <string> #include <string>
#include <vector> #include <vector>
#include <set> #include <set>
#include <random>
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100 #define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5 #define SPEC_VOCAB_CHECK_START_TOKEN_ID 5

View file

@ -1,3 +1,6 @@
#include "arg.h"
#include "common.h"
#include <string> #include <string>
#include <vector> #include <vector>
#include <sstream> #include <sstream>
@ -6,8 +9,6 @@
#undef NDEBUG #undef NDEBUG
#include <cassert> #include <cassert>
#include "common.h"
int main(void) { int main(void) {
gpt_params params; gpt_params params;