Merge branch 'ggerganov:master' into iq2_s

commit 4b7aaae8f3
Author: Abhilash Majumder
Date:   2024-03-19 14:08:56 +05:30 (committed by GitHub)
7 changed files with 22 additions and 15 deletions


@@ -12,6 +12,7 @@ jobs:
     steps:
       - uses: actions/stale@v5
         with:
+          exempt-issue-labels: "refactor,help wanted,good first issue,research"
           days-before-issue-stale: 30
           days-before-issue-close: 14
           stale-issue-label: "stale"


@@ -1056,7 +1056,8 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int
         return true;
     }
     if (arg == "-h" || arg == "--help") {
-        return false;
+        gpt_print_usage(argc, argv, gpt_params());
+        exit(0);
     }
     if (arg == "--version") {
         fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
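Note: this changes -h/--help from a parse failure into an explicit usage print and a clean exit. A minimal sketch of the new behaviour, with a hypothetical print_usage standing in for gpt_print_usage:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // hypothetical stand-in for gpt_print_usage
    static void print_usage(const char * prog) {
        printf("usage: %s [options]\n", prog);
    }

    int main(int argc, char ** argv) {
        for (int i = 1; i < argc; ++i) {
            if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
                print_usage(argv[0]);
                exit(0); // help is a successful outcome, not an error
            }
        }
        // ... normal argument handling ...
        return 0;
    }

Previously the false return was handled through the caller's invalid-argument path, so asking for help looked like a failed parse.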


@@ -32,13 +32,13 @@ typedef struct llama_sampling_params {
     float   dynatemp_range    = 0.00f; // 0.0 = disabled
     float   dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
     int32_t penalty_last_n    = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
-    float   penalty_repeat    = 1.10f; // 1.0 = disabled
+    float   penalty_repeat    = 1.00f; // 1.0 = disabled
     float   penalty_freq      = 0.00f; // 0.0 = disabled
     float   penalty_present   = 0.00f; // 0.0 = disabled
     int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau      = 5.00f; // target entropy
     float   mirostat_eta      = 0.10f; // learning rate
-    bool    penalize_nl       = true;  // consider newlines as a repeatable token
+    bool    penalize_nl       = false; // consider newlines as a repeatable token
     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
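Both new defaults disable the repeat penalties out of the box. With penalty_repeat = 1.00f the classic repetition-penalty update is an exact no-op, which is what the "1.0 = disabled" comment refers to; a minimal sketch of that update (the standard formulation, not code taken from the repository):

    #include <vector>

    // classic repetition penalty: dampen logits of recently seen tokens;
    // with penalty == 1.0f both branches leave every logit unchanged
    static void apply_repeat_penalty(std::vector<float> & logits,
                                     const std::vector<int> & last_tokens,
                                     float penalty) {
        for (int tok : last_tokens) {
            float & l = logits[tok];
            l = l > 0.0f ? l / penalty : l * penalty;
        }
    }

Likewise, penalize_nl = false now exempts the newline token from these penalties by default.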


@@ -497,7 +497,6 @@ struct clip_ctx {
     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer  = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;
 
     ggml_backend_t backend       = NULL;
     ggml_gallocr_t compute_alloc = NULL;
@@ -1676,6 +1675,9 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);
 
+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
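The three added calls give every backend object created during clip initialization a matching release; previously params_buffer, backend and compute_alloc were never freed. A minimal sketch of the create/free pairing, assuming a CPU backend for simplicity:

    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main(void) {
        // create a backend and a graph allocator on its default buffer type
        ggml_backend_t backend       = ggml_backend_cpu_init();
        ggml_gallocr_t compute_alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

        // ... allocate parameter buffers, build and evaluate graphs ...

        // release in reverse order of creation, as clip_free now does
        ggml_gallocr_free(compute_alloc);
        ggml_backend_free(backend);
        return 0;
    }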

flake.lock (generated)

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1709703039,
-        "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=",
+        "lastModified": 1710451336,
+        "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d",
+        "rev": "d691274a972b3165335d261cc4671335f5c67de9",
         "type": "github"
       },
       "original": {

@@ -1015,7 +1015,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif
 
 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 4
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif
 
 #ifndef GGML_SCHED_MAX_COPIES
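Tying GGML_SCHED_MAX_SPLIT_INPUTS to GGML_MAX_SRC keeps the two limits in lockstep: a scheduler split may need one input slot per source of a node, so a fixed bound of 4 can be exceeded once nodes carry more sources. A toy illustration of the sizing argument (hypothetical names, not ggml's actual scheduler types):

    #define TOY_MAX_SRC          10          // stand-in for GGML_MAX_SRC
    #define TOY_MAX_SPLIT_INPUTS TOY_MAX_SRC // bound follows the source limit

    struct toy_split {
        // one slot per possible source of a node; cannot overflow, since a
        // node has at most TOY_MAX_SRC sources
        const void * inputs[TOY_MAX_SPLIT_INPUTS];
        int          n_inputs;
    };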


@@ -540,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
         {
             { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
             { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
             { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
             { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
             { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
@@ -4300,9 +4301,9 @@ static bool llm_load_tensors(
                 {
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd});
-                    if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-                    } else {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
                         ml.size_data += ggml_nbytes(model.output);
@@ -4507,10 +4508,12 @@ static bool llm_load_tensors(
                     model.output_norm   = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd}, false);
-                    // same as tok_embd, duplicated to allow offloading
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
-                    ml.n_created--; // artificial tensor
-                    ml.size_data += ggml_nbytes(model.output);
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    if (!model.output) {
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
+                        ml.n_created--; // artificial tensor
+                        ml.size_data += ggml_nbytes(model.output);
+                    }
                 }
                 for (int i = 0; i < n_layer; ++i) {
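Both hunks converge on one pattern: try to load a dedicated output tensor as optional, and fall back to the tied token-embedding weights when it is absent. A self-contained sketch of that fallback with hypothetical stand-in types (the real loader also offloads the tensor and adjusts n_created/size_data, as shown above):

    #include <cstdio>
    #include <map>
    #include <string>

    // hypothetical stand-in for a model file's tensor directory
    using tensor_map = std::map<std::string, std::string>;

    int main() {
        // older converted files may carry no dedicated output tensor at all
        tensor_map file = { { "token_embd.weight", "f16 [4096, 32000]" } };

        // optional lookup first (mirrors create_tensor(..., required = false))
        auto it = file.find("output.weight");
        if (it == file.end()) {
            // fall back to the tied embedding weights; the loader marks the
            // result "artificial" so it is not counted against the file's data
            it = file.find("token_embd.weight");
        }
        printf("output tensor -> %s: %s\n", it->first.c_str(), it->second.c_str());
        return 0;
    }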