Merge branch 'ggerganov:master' into iq2_s

commit 4b7aaae8f3

7 changed files with 22 additions and 15 deletions

.github/workflows/close-issue.yml (vendored): 1 change
@@ -12,6 +12,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
+          exempt-issue-labels: "refactor,help wanted,good first issue,research"
           days-before-issue-stale: 30
           days-before-issue-close: 14
           stale-issue-label: "stale"
@@ -1056,7 +1056,8 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
         return true;
     }
     if (arg == "-h" || arg == "--help") {
-        return false;
+        gpt_print_usage(argc, argv, gpt_params());
+        exit(0);
     }
     if (arg == "--version") {
         fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
@@ -32,13 +32,13 @@ typedef struct llama_sampling_params {
     float   dynatemp_range    = 0.00f; // 0.0 = disabled
     float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.10f; // 1.0 = disabled
+    float   penalty_repeat    = 1.00f; // 1.0 = disabled
     float   penalty_freq      = 0.00f; // 0.0 = disabled
     float   penalty_present   = 0.00f; // 0.0 = disabled
     int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau      = 5.00f; // target entropy
     float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = true;  // consider newlines as a repeatable token
+    bool    penalize_nl       = false; // consider newlines as a repeatable token

     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
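(Note: the two flipped defaults above disable the repeat penalty and newline penalization out of the box. Below is a minimal sketch of how a caller could opt back into the previous behaviour; the include path and helper name are illustrative, only the struct fields come from the header shown in the hunk.)

    #include "sampling.h" // assumed include for llama_sampling_params

    // Hypothetical helper: restore the old penalty defaults on top of the new ones.
    static llama_sampling_params make_penalized_params() {
        llama_sampling_params params;     // defaults: penalty_repeat = 1.00f, penalize_nl = false
        params.penalty_repeat = 1.10f;    // re-enable the repeat penalty (previous default)
        params.penalize_nl    = true;     // count newlines as repeatable again (previous default)
        return params;
    }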
@@ -497,7 +497,6 @@ struct clip_ctx {

     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer  = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;

     ggml_backend_t backend       = NULL;
     ggml_gallocr_t compute_alloc = NULL;
@@ -1676,6 +1675,9 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);

+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }

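(For context, a minimal standalone sketch of the release order used in clip_free above: the parameter buffer, the backend, and the graph allocator are freed before the context itself is destroyed. The backend_ctx type and free function are illustrative; the ggml_backend_* and ggml_gallocr_free calls are the ones shown in the hunk, and the header names are assumed.)

    #include "ggml-alloc.h"   // ggml_gallocr_t, ggml_gallocr_free (assumed header layout)
    #include "ggml-backend.h" // ggml_backend_t, ggml_backend_buffer_t

    struct backend_ctx {
        ggml_backend_buffer_t params_buffer = NULL; // weights allocated on the backend
        ggml_backend_t        backend       = NULL;
        ggml_gallocr_t        compute_alloc = NULL; // graph allocator for compute buffers
    };

    static void backend_ctx_free(backend_ctx * ctx) {
        ggml_backend_buffer_free(ctx->params_buffer);
        ggml_backend_free(ctx->backend);
        ggml_gallocr_free(ctx->compute_alloc);
        delete ctx;
    }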
flake.lock (generated): 6 changes
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1709703039,
-        "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=",
+        "lastModified": 1710451336,
+        "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d",
+        "rev": "d691274a972b3165335d261cc4671335f5c67de9",
         "type": "github"
       },
       "original": {
@@ -1015,7 +1015,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif

 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 4
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif

 #ifndef GGML_SCHED_MAX_COPIES
llama.cpp: 17 changes
@@ -540,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         {
             { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output"},
             { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
             { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
             { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
@@ -4300,9 +4301,9 @@ static bool llm_load_tensors(
                 {
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                    if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-                    } else {
+
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
                         ml.size_data += ggml_nbytes(model.output);
@@ -4507,10 +4508,12 @@ static bool llm_load_tensors(
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);

-                    // same as tok_embd, duplicated to allow offloading
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    if (!model.output) {
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
                 }

                 for (int i = 0; i < n_layer; ++i) {
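(Both llm_load_tensors hunks above converge on the same pattern: load the output tensor as optional, and fall back to reusing the token-embedding tensor when the model file does not provide one, adjusting the loader's bookkeeping for the duplicated tensor. Below is a minimal standalone sketch of that fallback; the model_outputs struct and resolve_output helper are illustrative, not llama.cpp API.)

    struct ggml_tensor; // opaque, as in ggml

    // Hypothetical view of the two tensors involved.
    struct model_outputs {
        ggml_tensor * tok_embd = nullptr; // always created by the loader
        ggml_tensor * output   = nullptr; // may be absent from the GGUF file
    };

    // maybe_output is nullptr when the tensor is missing, mirroring
    // create_tensor(..., false) in the diff above.
    static void resolve_output(model_outputs & m, ggml_tensor * maybe_output) {
        m.output = maybe_output;
        if (!m.output) {
            m.output = m.tok_embd; // tied output weights: reuse the embedding matrix
        }
    }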