Merge branch 'master' into concedo_experimental
# Conflicts:
#	.github/workflows/build.yml
#	README.md
#	build.zig
#	flake.nix

commit bd2500db36
6 changed files with 64 additions and 49 deletions

@@ -647,9 +647,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --cfg-negative-prompt-file FNAME\n");
     printf("                        negative prompt file to use for guidance. (default: empty)\n");
     printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
-    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale\n");
+    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
+    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor (default: loaded from model)\n");
     printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
     printf("  --no-penalize-nl      do not penalize newline token\n");
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
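
The two rope flags above are reciprocal views of the same knob: stretching the context by a factor N is the same as scaling the rotary frequencies by 1/N. A minimal sketch of that relationship, with illustrative variable names (not the ones used in the actual code):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // --rope-scale N stretches the context by N; internally it is stored as
    // the reciprocal, i.e. --rope-freq-scale 1/N.
    const float rope_scale      = 2.0f;              // ask for 2x context
    const float rope_freq_scale = 1.0f / rope_scale; // 0.5f

    // Linear RoPE scaling multiplies the position by rope_freq_scale before
    // the rotary angle is computed: theta_i = pos * scale / base^(2i/d).
    const float rope_freq_base = 10000.0f;
    const int   pos = 100, i = 3, d = 128;
    const float theta = pos * rope_freq_scale /
                        std::pow(rope_freq_base, (2.0f * i) / d);
    std::printf("theta_%d at pos %d = %f\n", i, pos, theta);
    return 0;
}
```
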
@@ -21,7 +21,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 
     if (plan.work_size > 0) {
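
This hunk and several below make the same mechanical change: file-local helpers gain `static`, giving them internal linkage so their symbols never leave the translation unit. A tiny sketch of what that buys (file names hypothetical):

```cpp
// a.cpp
static int helper() { return 1; } // internal linkage: symbol stays inside a.cpp

// b.cpp
static int helper() { return 2; } // a distinct function; without static, two
                                  // non-static helper()s with the same name
                                  // would collide at link time
```
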
@@ -32,7 +32,7 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
     ggml_graph_compute(graph, &plan);
 }
 
-float tensor_sum_elements(const ggml_tensor * tensor) {
+static float tensor_sum_elements(const ggml_tensor * tensor) {
     double sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
         for (int j = 0; j < tensor->ne[1]; j++) {
@@ -44,7 +44,7 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
     return sum;
 }
 
-void tensor_dump(const ggml_tensor * tensor, const char * name) {
+static void tensor_dump(const ggml_tensor * tensor, const char * name) {
     printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
         tensor->type, ggml_type_name(tensor->type),
         tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
@@ -59,7 +59,7 @@ struct benchmark_params_struct {
     int32_t n_iterations = 10;
 };
 
-void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
+static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
@@ -253,7 +253,7 @@ int main(int argc, char ** argv) {
     // Check that the matrix multiplication result is in the right ballpark
     // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
     float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
-    float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
+    float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
     float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
 
     if (delta > allowed_delta) {
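
The `abs` → `std::abs` change fixes a genuine trap rather than style: if only the C library's `int abs(int)` is in scope, the float argument is converted to `int` first and the fractional part is silently discarded. A small demonstration of the truncation the fix avoids:

```cpp
#include <cmath>   // std::abs overloads for float/double - what the fix uses
#include <cstdio>
#include <cstdlib> // C abs(int) - what an unqualified abs() can resolve to

int main() {
    const float delta = 0.75f;
    // If unqualified abs() binds to the C int overload, the argument is
    // truncated to 0 *before* the absolute value is taken:
    std::printf("int abs sees:  %d\n", abs(static_cast<int>(delta))); // 0
    std::printf("std::abs sees: %g\n", std::abs(delta));              // 0.75
    return 0;
}
```
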
@@ -1,3 +1,21 @@
-# embedding
+# llama.cpp/example/embedding
 
-TODO
+This example demonstrates generate high-dimensional embedding vector of a given text with llama.cpp.
+
+## Quick Start
+
+To get started right away, run the following command, making sure to use the correct path for the model you have:
+
+### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./embedding -m ./path/to/model --log-disable -p "Hello World!" 2>/dev/null
+```
+
+### Windows:
+
+```powershell
+embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
+```
+
+The above command will output space-separated float values.
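
Since the new README states the output is plain space-separated floats, reading it back from another program is trivial. A minimal consumer sketch (C++ here for consistency with the rest of the codebase; the pipe invocation in the comment is an assumption):

```cpp
// Reads a space-separated embedding from stdin, e.g.:
//   ./embedding -m model.gguf --log-disable -p "Hello World!" 2>/dev/null | ./consumer
#include <iostream>
#include <vector>

int main() {
    std::vector<float> embedding;
    float v;
    while (std::cin >> v) {
        embedding.push_back(v);
    }
    std::cout << "read " << embedding.size() << " dimensions\n";
    return 0;
}
```
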
@@ -701,8 +701,8 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    printf("  --rope-freq-base N    RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    printf("  --rope-freq-base N    RoPE base frequency (default: loaded from model)\n");
+    printf("  --rope-freq-scale N   RoPE frequency scaling factor (default: loaded from model)\n");
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
     printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
@@ -850,7 +850,7 @@ std::array<std::string, 2> mul_str_values = {
     "mul_f32", "float"
 };
 
-std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+static std::string& replace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
         s.replace(pos, from.length(), to);
@@ -859,7 +859,7 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
     return s;
 }
 
-std::string generate_kernels() {
+static std::string generate_kernels() {
     std::stringstream src;
     src << program_source << '\n';
     src << k_quants_source << '\n';
@@ -1808,7 +1808,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     return false;
 }
 
-bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
     // If device doesn't support FP16
     if (!fp16_support) {
         return false;

llama.cpp
@@ -934,23 +934,22 @@ static const size_t kB = 1024;
 static const size_t MB = kB*kB;
 static const size_t GB = kB*kB*kB;
 
-// default hparams (LLaMA 7B)
 struct llama_hparams {
-    uint32_t n_vocab     = 32000;
-    uint32_t n_ctx_train = 2048;  // the context size used during training
-    uint32_t n_ctx       = 512;   // the context size used during inference
-    uint32_t n_embd      = 4096;
-    uint32_t n_head      = 32;
-    uint32_t n_head_kv   = 32;
-    uint32_t n_layer     = 32;
-    uint32_t n_rot       = 64;
-    uint32_t n_ff        = 11008;
+    uint32_t n_vocab;
+    uint32_t n_ctx_train; // context size the model was trained on
+    uint32_t n_ctx;       // context size used during inference
+    uint32_t n_embd;
+    uint32_t n_head;
+    uint32_t n_head_kv;
+    uint32_t n_layer;
+    uint32_t n_rot;
+    uint32_t n_ff;
 
-    float f_norm_eps     = 1e-5;
-    float f_norm_rms_eps = 1e-5;
+    float f_norm_eps;
+    float f_norm_rms_eps;
 
-    float rope_freq_base  = 10000.0f;
-    float rope_freq_scale = 1.0f;
+    float rope_freq_base;
+    float rope_freq_scale;
 
     bool operator!=(const llama_hparams & other) const {
         return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT
@@ -1081,7 +1080,7 @@ struct llama_model {
 
     std::string name = "n/a";
 
-    llama_hparams hparams;
+    llama_hparams hparams = {};
     llama_vocab vocab;
 
     struct ggml_tensor * tok_embeddings;
@@ -1680,28 +1679,17 @@ static void llm_load_hparams(
     hparams.n_head_kv = hparams.n_head;
     GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
 
-    // TODO: manually setting rope freq base and scale should override this
-    // FIXME: partial fix when the param specified is not the default value, but
-    //        will not work for overriding the model value to the params default
-
-    llama_context_params defaults = llama_context_default_params();
-
-    // rope_freq_base
-    {
-        float ropebase = 10000.0f;
-        GGUF_GET_KEY(ctx, ropebase, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
-        if (ropebase != 10000.0f && rope_freq_base == defaults.rope_freq_base) {
-            rope_freq_base = ropebase;
-        }
+    // rope_freq_base (optional)
+    if (rope_freq_base == 0.0f) {
+        rope_freq_base = 10000.0f;
+        GGUF_GET_KEY(ctx, rope_freq_base, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
     }
 
     // rope_freq_scale (inverse of the kv) is optional
-    {
+    if (rope_freq_scale == 0.0f) {
         float ropescale = 1.0f;
         GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
-        if (ropescale != 1.0f && rope_freq_scale == defaults.rope_freq_scale) {
-            rope_freq_scale = 1.0f/ropescale;
-        }
+        rope_freq_scale = 1.0f/ropescale;
     }
 
     // sanity check for n_rot (optional)
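
The rewritten block treats `0.0f` as a sentinel for "not set by the user", which removes the FIXME'd blind spot: under the old heuristic you could never override the model's value back to the compiled-in default. A hypothetical distillation of the new precedence (the real code does the lookup in place with `GGUF_GET_KEY` rather than through a helper like this):

```cpp
// Hypothetical distillation of the precedence implemented above; 0.0f is the
// sentinel meaning "the user did not set this".
static float resolve_rope_freq_base(float user_value, bool gguf_has_key, float gguf_value) {
    if (user_value != 0.0f) {
        return user_value; // an explicit CLI/API override always wins
    }
    if (gguf_has_key) {
        return gguf_value; // otherwise inherit the model's GGUF metadata
    }
    return 10000.0f;       // otherwise fall back to the classic LLaMA default
}
```
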
@@ -3787,6 +3775,15 @@ static bool llama_eval_internal(
         n_threads = std::min(4, n_threads);
     }
 
+    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
+    const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA ||
+        model.arch == LLM_ARCH_BAICHUAN ||
+        model.arch == LLM_ARCH_FALCON;
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
+    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+        n_threads = 1;
+    }
+
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
 
@@ -6201,8 +6198,8 @@ struct llama_context_params llama_context_default_params() {
         /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
-        /*.rope_freq_base              =*/ 10000.0f,
-        /*.rope_freq_scale             =*/ 1.0f,
+        /*.rope_freq_base              =*/ 0.0f,
+        /*.rope_freq_scale             =*/ 0.0f,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
         /*.low_vram                    =*/ false,
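
Seen from the caller's side, the new `0.0f` defaults mean "inherit from the model" rather than a concrete value. A usage sketch, assuming this revision's `llama_context_params` layout:

```cpp
#include "llama.h" // assuming this revision's header

void configure_context_params(void) {
    struct llama_context_params params = llama_context_default_params();
    // Leaving rope_freq_base / rope_freq_scale at 0.0f lets llm_load_hparams
    // read them from the GGUF file (or fall back to 10000.0f / 1.0f).
    params.rope_freq_scale = 0.5f; // setting one explicitly now always
                                   // overrides the model, e.g. 2x linear
                                   // context extension
    (void)params; // pass to llama_new_context_with_model(...) in real use
}
```
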