Merge branch 'ggerganov:master' into iq2_s

commit 4b7aaae8f3
7 changed files with 22 additions and 15 deletions

.github/workflows/close-issue.yml (vendored): 1 change

@@ -12,6 +12,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
+          exempt-issue-labels: "refactor,help wanted,good first issue,research"
           days-before-issue-stale: 30
           days-before-issue-close: 14
           stale-issue-label: "stale"

common/common.cpp

@@ -1056,7 +1056,8 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
         return true;
     }
     if (arg == "-h" || arg == "--help") {
-        return false;
+        gpt_print_usage(argc, argv, gpt_params());
+        exit(0);
     }
     if (arg == "--version") {
         fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
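
Note on the hunk above: "-h"/"--help" used to make the parser return false, which the caller handled like any other bad flag; now the parser prints usage itself and exits with status 0. A minimal sketch of the new flow (the names and caller shape here are illustrative stand-ins, not the actual llama.cpp code):

    #include <cstdio>
    #include <cstdlib>
    #include <string>

    static void print_usage_sketch() {
        std::printf("usage: ./main [options]\n");
    }

    // stand-in for gpt_params_find_arg: true = flag consumed, false = unknown flag
    static bool find_arg_sketch(const std::string & arg) {
        if (arg == "-h" || arg == "--help") {
            print_usage_sketch();
            std::exit(0); // asking for help is a success, not a parse error
        }
        return false;     // unknown flag: the caller reports an error, as before
    }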

common/sampling.h

@@ -32,13 +32,13 @@ typedef struct llama_sampling_params {
     float dynatemp_range = 0.00f; // 0.0 = disabled
     float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float penalty_repeat = 1.10f; // 1.0 = disabled
+    float penalty_repeat = 1.00f; // 1.0 = disabled
     float penalty_freq = 0.00f; // 0.0 = disabled
     float penalty_present = 0.00f; // 0.0 = disabled
     int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float mirostat_tau = 5.00f; // target entropy
     float mirostat_eta = 0.10f; // learning rate
-    bool penalize_nl = true; // consider newlines as a repeatable token
+    bool penalize_nl = false; // consider newlines as a repeatable token

     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
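
Note on the hunk above: two sampling defaults are relaxed here: penalty_repeat drops from 1.10f to 1.00f, turning the repeat penalty off unless the user opts in, and penalize_nl flips to false, so newlines are no longer treated as repeatable tokens. The value 1.0 disables the penalty because of how the usual CTRL-style rule is applied; a short sketch of that rule (an illustration of the technique, not the library's exact sampler code):

    #include <vector>

    // CTRL-style repeat penalty: positive logits are divided by p and
    // negative logits are multiplied by p, so p == 1.0f leaves every
    // logit unchanged. That is why "1.0 = disabled" in the comments above.
    static void apply_repeat_penalty_sketch(std::vector<float> & logits,
                                            const std::vector<int> & last_tokens,
                                            float p) {
        for (int tok : last_tokens) {
            float & l = logits[tok];
            l = l > 0.0f ? l / p : l * p;
        }
    }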

examples/llava/clip.cpp

@@ -497,7 +497,6 @@ struct clip_ctx {

     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;

     ggml_backend_t backend = NULL;
     ggml_gallocr_t compute_alloc = NULL;

@@ -1676,6 +1675,9 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);

+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
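
Note on the two clip.cpp hunks: the unused compute_buffer member is dropped (compute memory is owned by the ggml_gallocr_t allocator), and clip_free now releases the weight buffer, the backend handle and the graph allocator, which were previously leaked. A hypothetical driver loop showing what the fix buys (clip_model_load and clip_free are the llava example's real entry points; the loop itself is illustrative):

    #include "clip.h"

    // Before this change every iteration leaked params_buffer, backend
    // and compute_alloc; now clip_free() releases all three.
    static void reload_model_n_times(const char * path, int n) {
        for (int i = 0; i < n; ++i) {
            clip_ctx * ctx = clip_model_load(path, /*verbosity=*/0);
            if (ctx == nullptr) {
                break;
            }
            clip_free(ctx);
        }
    }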

flake.lock (generated): 6 changes

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1709703039,
-        "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=",
+        "lastModified": 1710451336,
+        "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d",
+        "rev": "d691274a972b3165335d261cc4671335f5c67de9",
         "type": "github"
       },
       "original": {

ggml-backend.c

@@ -1015,7 +1015,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif

 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 4
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif

 #ifndef GGML_SCHED_MAX_COPIES
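
Note on the hunk above: a scheduler split may need to copy every source of a node in from other backends, so a hard-coded cap of 4 split inputs could overflow once an op has more sources than that; tying the cap to GGML_MAX_SRC keeps the two limits in lockstep. A compile-time sketch of the invariant (the value and struct shape are assumptions for illustration, not ggml's actual definitions):

    // Assumed per-tensor source limit; GGML_MAX_SRC in ggml proper.
    constexpr int kMaxSrc         = 10;
    constexpr int kMaxSplitInputs = kMaxSrc; // was a hard-coded 4 before this change

    struct sched_split_sketch {
        const void * inputs[kMaxSplitInputs]; // tensors copied in before the split runs
        int          n_inputs = 0;
    };

    static_assert(kMaxSplitInputs >= kMaxSrc,
                  "a split must accept one input per tensor source");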

llama.cpp: 13 changes

@@ -540,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     {
         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output" },
         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
         { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },

@@ -4300,9 +4301,9 @@ static bool llm_load_tensors(
             {
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-                if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-                } else {
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                if (!model.output) {
                     model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                     ml.n_created--; // artificial tensor
                     ml.size_data += ggml_nbytes(model.output);

@@ -4507,11 +4508,13 @@ static bool llm_load_tensors(
                 model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                 model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);

-                // same as tok_embd, duplicated to allow offloading
-                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                if (!model.output) {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                     ml.n_created--; // artificial tensor
                     ml.size_data += ggml_nbytes(model.output);
+                }
             }

             for (int i = 0; i < n_layer; ++i) {
                 ggml_context * ctx_layer = ctx_for_layer(i);
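
Note on the three llama.cpp hunks: registering LLM_TENSOR_OUTPUT for this architecture and loading it with the trailing required = false flag replaces both the explicit gguf_find_tensor probe and the unconditional tok_embd duplication. When output.weight is absent, create_tensor returns null and the loader falls back to reusing token_embd.weight as a tied output head, decrementing n_created so the reused tensor is not counted twice. A minimal sketch of the pattern with a hypothetical loader type (not the real llama_model_loader API):

    #include <stdexcept>
    #include <string>

    struct tensor_sketch { std::string name; };

    struct loader_sketch {
        int  n_created  = 0;
        bool has_output = false; // pretend the GGUF file lacks output.weight
        tensor_sketch * create(const std::string & name, bool required = true) {
            if (name == "output.weight" && !has_output) {
                if (required) {
                    throw std::runtime_error("missing tensor: " + name);
                }
                return nullptr; // tolerated: the caller handles the fallback
            }
            ++n_created;
            return new tensor_sketch{name};
        }
    };

    static tensor_sketch * load_output_head(loader_sketch & ml) {
        tensor_sketch * out = ml.create("output.weight", /*required=*/false);
        if (out == nullptr) {
            out = ml.create("token_embd.weight"); // tied embeddings fallback
            --ml.n_created;                       // artificial tensor: already counted once
        }
        return out;
    }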