Merge branch 'master' into concedo_experimental
# Conflicts: # README.md # ggml-opencl.c
This commit is contained in:
commit
c9eb2ba1c5
9 changed files with 122 additions and 52 deletions
18
.clang-tidy
Normal file
18
.clang-tidy
Normal file
|
@ -0,0 +1,18 @@
|
|||
---
|
||||
Checks: >
|
||||
bugprone-*,
|
||||
-bugprone-easily-swappable-parameters,
|
||||
-bugprone-implicit-widening-of-multiplication-result,
|
||||
-bugprone-narrowing-conversions,
|
||||
readability-*,
|
||||
-readability-avoid-unconditional-preprocessor-if,
|
||||
-readability-function-cognitive-complexity,
|
||||
-readability-identifier-length,
|
||||
-readability-implicit-bool-conversion,
|
||||
-readability-magic-numbers,
|
||||
-readability-uppercase-literal-suffix,
|
||||
clang-analyzer-*,
|
||||
-clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
|
||||
performance-*,
|
||||
portability-*,
|
||||
FormatStyle: none
|
20
.github/workflows/tidy-post.yml
vendored
Normal file
20
.github/workflows/tidy-post.yml
vendored
Normal file
|
@ -0,0 +1,20 @@
|
|||
name: clang-tidy review post comments
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows: ["clang-tidy-review"]
|
||||
types:
|
||||
- completed
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: ZedThree/clang-tidy-review/post@v0.13.0
|
||||
# lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
|
||||
with:
|
||||
# adjust options as necessary
|
||||
lgtm_comment_body: ''
|
||||
annotations: false
|
||||
max_comments: 25
|
23
.github/workflows/tidy-review.yml
vendored
Normal file
23
.github/workflows/tidy-review.yml
vendored
Normal file
|
@ -0,0 +1,23 @@
|
|||
name: clang-tidy-review
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- master
|
||||
|
||||
jobs:
|
||||
clang-tidy-review:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
|
||||
- uses: ZedThree/clang-tidy-review@v0.13.0
|
||||
id: review
|
||||
with:
|
||||
lgtm_comment_body: ''
|
||||
build_dir: build
|
||||
cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
|
||||
split_workflow: true
|
||||
|
||||
- uses: ZedThree/clang-tidy-review/upload@v0.13.0
|
|
@ -91,9 +91,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
bool escape_prompt = false;
|
||||
std::string arg;
|
||||
gpt_params default_params;
|
||||
const std::string arg_prefix = "--";
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
|
||||
if (arg == "-s" || arg == "--seed") {
|
||||
#if defined(GGML_USE_CUBLAS)
|
||||
|
@ -141,27 +145,27 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
if (params.prompt.back() == '\n') {
|
||||
params.prompt.pop_back();
|
||||
}
|
||||
} else if (arg == "-n" || arg == "--n_predict") {
|
||||
} else if (arg == "-n" || arg == "--n-predict") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_predict = std::stoi(argv[i]);
|
||||
} else if (arg == "--top_k") {
|
||||
} else if (arg == "--top-k") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.top_k = std::stoi(argv[i]);
|
||||
} else if (arg == "-c" || arg == "--ctx_size") {
|
||||
} else if (arg == "-c" || arg == "--ctx-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "--memory_f32") {
|
||||
} else if (arg == "--memory-f32") {
|
||||
params.memory_f16 = false;
|
||||
} else if (arg == "--top_p") {
|
||||
} else if (arg == "--top-p") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
|
@ -185,25 +189,25 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.typical_p = std::stof(argv[i]);
|
||||
} else if (arg == "--repeat_last_n") {
|
||||
} else if (arg == "--repeat-last-n") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.repeat_last_n = std::stoi(argv[i]);
|
||||
} else if (arg == "--repeat_penalty") {
|
||||
} else if (arg == "--repeat-penalty") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.repeat_penalty = std::stof(argv[i]);
|
||||
} else if (arg == "--frequency_penalty") {
|
||||
} else if (arg == "--frequency-penalty") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.frequency_penalty = std::stof(argv[i]);
|
||||
} else if (arg == "--presence_penalty") {
|
||||
} else if (arg == "--presence-penalty") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
|
@ -215,19 +219,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.mirostat = std::stoi(argv[i]);
|
||||
} else if (arg == "--mirostat_lr") {
|
||||
} else if (arg == "--mirostat-lr") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.mirostat_eta = std::stof(argv[i]);
|
||||
} else if (arg == "--mirostat_ent") {
|
||||
} else if (arg == "--mirostat-ent") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.mirostat_tau = std::stof(argv[i]);
|
||||
} else if (arg == "-b" || arg == "--batch_size") {
|
||||
} else if (arg == "-b" || arg == "--batch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
|
@ -310,7 +314,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} else if (arg == "--n_parts") {
|
||||
} else if (arg == "--n-parts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
|
@ -384,31 +388,31 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
|
||||
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
||||
fprintf(stderr, " prompt file to start generation.\n");
|
||||
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
|
||||
fprintf(stderr, " --top_k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
|
||||
fprintf(stderr, " --top_p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
|
||||
fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
|
||||
fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
|
||||
fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
|
||||
fprintf(stderr, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
|
||||
fprintf(stderr, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
|
||||
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
|
||||
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
|
||||
fprintf(stderr, " --presence_penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
|
||||
fprintf(stderr, " --frequency_penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
|
||||
fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
|
||||
fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
|
||||
fprintf(stderr, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
|
||||
fprintf(stderr, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
|
||||
fprintf(stderr, " --mirostat N use Mirostat sampling.\n");
|
||||
fprintf(stderr, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
|
||||
fprintf(stderr, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
|
||||
fprintf(stderr, " --mirostat_lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
|
||||
fprintf(stderr, " --mirostat_ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
|
||||
fprintf(stderr, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
|
||||
fprintf(stderr, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
|
||||
fprintf(stderr, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
|
||||
fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n");
|
||||
fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
|
||||
fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
|
||||
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
|
||||
fprintf(stderr, " --no-penalize-nl do not penalize newline token\n");
|
||||
fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
|
||||
fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value\n");
|
||||
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
|
||||
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --n-parts N number of model parts (default: -1 = determine from dimensions)\n");
|
||||
fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
|
||||
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
if (llama_mlock_supported()) {
|
||||
|
|
|
@ -56,9 +56,6 @@ int main(int argc, char ** argv) {
|
|||
// tokenize the prompt
|
||||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
// determine newline token
|
||||
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
|
|
|
@ -121,7 +121,7 @@ int main(int argc, char ** argv) {
|
|||
// uncomment the "used_mem" line in llama.cpp to see the results
|
||||
if (params.mem_test) {
|
||||
{
|
||||
const std::vector<llama_token> tmp(params.n_batch, 0);
|
||||
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
}
|
||||
|
||||
|
|
|
@ -358,4 +358,4 @@ void ggml_cl_sgemm_wrapper(
|
|||
clWaitForEvents(1, &ev_c);
|
||||
clReleaseEvent(ev_sgemm);
|
||||
clReleaseEvent(ev_c);
|
||||
}
|
||||
}
|
||||
|
|
48
llama.cpp
48
llama.cpp
|
@ -2466,8 +2466,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
|
|||
}
|
||||
|
||||
// Copies the state to the specified destination address
|
||||
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
||||
uint8_t * out = dest;
|
||||
size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
||||
uint8_t * out = dst;
|
||||
|
||||
// copy rng
|
||||
{
|
||||
|
@ -2527,7 +2527,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
|||
|
||||
if (kv_size) {
|
||||
const size_t elt_size = ggml_element_size(kv_self.k);
|
||||
|
||||
char buffer[4096];
|
||||
|
||||
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
||||
ggml_cgraph gf{};
|
||||
gf.n_threads = 1;
|
||||
|
@ -2551,10 +2553,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
|||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
||||
ggml_graph_compute(cpy_ctx, &gf);
|
||||
|
||||
ggml_free(cpy_ctx);
|
||||
}
|
||||
}
|
||||
|
||||
const size_t written = out - dest;
|
||||
const size_t written = out - dst;
|
||||
const size_t max_size = llama_get_state_size(ctx);
|
||||
|
||||
LLAMA_ASSERT(written <= max_size);
|
||||
|
@ -2564,15 +2568,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
|
|||
|
||||
// Sets the state reading from the specified source address
|
||||
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
||||
const uint8_t * in = src;
|
||||
const uint8_t * inp = src;
|
||||
|
||||
// set rng
|
||||
{
|
||||
size_t rng_size;
|
||||
char rng_buf[LLAMA_MAX_RNG_STATE];
|
||||
|
||||
memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
|
||||
memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
|
||||
memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
|
||||
memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
|
||||
|
||||
std::stringstream rng_ss;
|
||||
rng_ss.str(std::string(&rng_buf[0], rng_size));
|
||||
|
@ -2586,30 +2590,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|||
size_t logits_cap;
|
||||
size_t logits_size;
|
||||
|
||||
memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
|
||||
memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
|
||||
memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
|
||||
memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
|
||||
|
||||
LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
|
||||
|
||||
if (logits_size) {
|
||||
ctx->logits.resize(logits_size);
|
||||
memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
|
||||
memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
|
||||
}
|
||||
|
||||
in += logits_cap * sizeof(float);
|
||||
inp += logits_cap * sizeof(float);
|
||||
}
|
||||
|
||||
// set embeddings
|
||||
{
|
||||
size_t embedding_size;
|
||||
|
||||
memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
|
||||
memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
|
||||
|
||||
LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
|
||||
|
||||
if (embedding_size) {
|
||||
memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
|
||||
in += embedding_size * sizeof(float);
|
||||
memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
|
||||
inp += embedding_size * sizeof(float);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2624,25 +2628,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|||
size_t kv_size;
|
||||
int kv_ntok;
|
||||
|
||||
memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
|
||||
memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
|
||||
memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
|
||||
memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
|
||||
|
||||
if (kv_size) {
|
||||
LLAMA_ASSERT(kv_self.buf.size == kv_size);
|
||||
|
||||
const size_t elt_size = ggml_element_size(kv_self.k);
|
||||
|
||||
char buffer[4096];
|
||||
|
||||
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
|
||||
ggml_cgraph gf{};
|
||||
gf.n_threads = 1;
|
||||
|
||||
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
||||
kin3d->data = (void *) in;
|
||||
in += ggml_nbytes(kin3d);
|
||||
kin3d->data = (void *) inp;
|
||||
inp += ggml_nbytes(kin3d);
|
||||
|
||||
ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
|
||||
vin3d->data = (void *) in;
|
||||
in += ggml_nbytes(vin3d);
|
||||
vin3d->data = (void *) inp;
|
||||
inp += ggml_nbytes(vin3d);
|
||||
|
||||
ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
|
||||
n_embd, kv_ntok, n_layer,
|
||||
|
@ -2655,12 +2661,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
|||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
||||
ggml_graph_compute(cpy_ctx, &gf);
|
||||
|
||||
ggml_free(cpy_ctx);
|
||||
}
|
||||
|
||||
ctx->model.kv_self.n = kv_ntok;
|
||||
}
|
||||
|
||||
const size_t nread = in - src;
|
||||
const size_t nread = inp - src;
|
||||
const size_t max_size = llama_get_state_size(ctx);
|
||||
|
||||
LLAMA_ASSERT(nread <= max_size);
|
||||
|
|
2
llama.h
2
llama.h
|
@ -134,7 +134,7 @@ extern "C" {
|
|||
// Copies the state to the specified destination address.
|
||||
// Destination needs to have allocated enough memory.
|
||||
// Returns the number of bytes copied
|
||||
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
|
||||
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
|
||||
|
||||
// Set the state reading from the specified address
|
||||
// Returns the number of bytes read
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue