Merge branch 'ggerganov:master' into master

commit d6961d4c48

12 changed files with 162 additions and 90 deletions
.github/workflows/server.yml (vendored, 9 changes)

@@ -3,6 +3,11 @@ name: Server
 
 on:
   workflow_dispatch: # allows manual triggering
+    inputs:
+      slow_tests:
+        description: 'Run slow tests'
+        required: true
+        type: boolean
   push:
     branches:
       - master
@@ -11,7 +16,7 @@ on:
     types: [opened, synchronize, reopened]
    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
  schedule:
-    - cron: '00 0 * * *'
+    - cron: '0 0 * * *'
 
 jobs:
   server:
@@ -80,7 +85,7 @@ jobs:
 
       - name: Slow tests
         id: server_integration_tests_slow
-        if: github.event.schedule != ''
+        if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
        run: |
          cd examples/server/tests
          PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
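Note: with the new `workflow_dispatch` inputs, the slow server tests no longer run only on the nightly cron (Release builds); they can presumably also be started by hand from the repository's Actions tab or with something like `gh workflow run server.yml -f slow_tests=true` (the `gh` invocation is an illustration, not something added by this commit).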
README.md

@@ -8,6 +8,10 @@
 
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 
+### Recent API changes
+
+- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
+
 ### Hot topics
 
 - The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
common/common.cpp

@@ -335,6 +335,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.yarn_beta_slow = std::stof(argv[i]);
+        } else if (arg == "--pooling") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            std::string value(argv[i]);
+            /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
+            else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
+            else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+            else { invalid_param = true; break; }
         } else if (arg == "--defrag-thold" || arg == "-dt") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -1014,6 +1024,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --yarn-attn-factor N  YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
     printf("  --yarn-beta-slow N    YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
     printf("  --yarn-beta-fast N    YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
+    printf("  --pooling {none,mean,cls}\n");
+    printf("                        pooling type for embeddings, use model default if unspecified\n");
     printf("  -dt N, --defrag-thold N\n");
     printf("                        KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
     printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
@@ -1296,6 +1308,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.yarn_orig_ctx  = params.yarn_orig_ctx;
+    cparams.pooling_type   = params.pooling_type;
     cparams.defrag_thold   = params.defrag_thold;
     cparams.offload_kqv    = !params.no_kv_offload;
 
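Note: the new `--pooling {none,mean,cls}` flag follows the same pattern as `--rope-scaling {none,linear,yarn}`. As an illustrative invocation (not part of this commit), an embedding run could force CLS pooling with something like `./embedding -m models/7B/ggml-model.gguf -p "hello" --pooling cls`; leaving the flag out keeps the model's own default, per the usage text above.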
common/common.h

@@ -76,8 +76,11 @@ struct gpt_params {
     float   yarn_beta_slow = 1.0f; // YaRN high correction dim
     int32_t yarn_orig_ctx  = 0;    // YaRN original context length
     float   defrag_thold   = -1.0f; // KV cache defragmentation threshold
-    int32_t rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
+    llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
+    llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+
     // // sampling parameters
     struct llama_sampling_params sparams;
convert-hf-to-gguf.py

@@ -1644,16 +1644,17 @@ class BertModel(Model):
         self.gguf_writer.add_causal_attention(False)
 
         # get pooling path
-        with open(self.dir_model / "modules.json", encoding="utf-8") as f:
-            modules = json.load(f)
         pooling_path = None
-        for mod in modules:
-            if mod["type"] == "sentence_transformers.models.Pooling":
-                pooling_path = mod["path"]
-                break
+        module_path = self.dir_model / "modules.json"
+        if module_path.is_file():
+            with open(module_path, encoding="utf-8") as f:
+                modules = json.load(f)
+            for mod in modules:
+                if mod["type"] == "sentence_transformers.models.Pooling":
+                    pooling_path = mod["path"]
+                    break
 
         # get pooling type
-        pooling_type = gguf.PoolingType.NONE
         if pooling_path is not None:
             with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
                 pooling = json.load(f)
@@ -1663,8 +1664,7 @@ class BertModel(Model):
                 pooling_type = gguf.PoolingType.CLS
             else:
                 raise NotImplementedError("Only MEAN and CLS pooling types supported")
-
-        self.gguf_writer.add_pooling_type(pooling_type)
+            self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
         path = self.dir_model
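Note: with this change a BERT checkpoint that ships without a sentence-transformers `modules.json` no longer aborts the conversion with a missing-file error; the pooling configuration is only looked up, and only written to the GGUF, when a `sentence_transformers.models.Pooling` module is actually listed.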
examples/server/README.md

@@ -18,7 +18,7 @@ The project is under active development, and we are [looking for feedback and co
 
 - `--threads N`, `-t N`: Set the number of threads to use during generation.
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
-- `--threads-http N`: number of threads in the http server pool to process requests (default: `std::thread::hardware_concurrency()`)
+- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
examples/server/server.cpp

@@ -2027,7 +2027,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -v, --verbose         verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
     printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
     printf("  -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
-    printf("  --threads-http N      number of threads in the http server pool to process requests (default: hardware concurrency)\n");
+    printf("  --threads-http N      number of threads in the http server pool to process requests (default: max(hardware concurrency - 1, --parallel N + 2))\n");
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
     printf("  --rope-scaling {none,linear,yarn}\n");
     printf("                        RoPE frequency scaling method, defaults to linear unless specified by the model\n");
@@ -3494,10 +3494,12 @@ int main(int argc, char **argv)
     }*/
     //);
 
-    if (sparams.n_threads_http > 0) {
-        log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
-        svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
+    if (sparams.n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        sparams.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
     }
+    log_data["n_threads_http"] = std::to_string(sparams.n_threads_http);
+    svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); };
 
     LOG_INFO("HTTP server listening", log_data);
     // run the HTTP server in a thread - see comment below
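Worked example (illustrative numbers, not from the commit): on a machine where `std::thread::hardware_concurrency()` returns 16 and the server runs with `--parallel 4`, an unset `--threads-http` now defaults to `max(4 + 2, 16 - 1) = 15` HTTP worker threads instead of leaving the pool size to cpp-httplib. An explicit `--threads-http N` still takes precedence, since the default is only applied when the configured value is less than 1.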
flake.lock (generated, 18 changes)

@@ -5,11 +5,11 @@
         "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1706830856,
-        "narHash": "sha256-a0NYyp+h9hlb7ddVz4LUn1vT/PLwqfrWYcHMvFB1xYg=",
+        "lastModified": 1709336216,
+        "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=",
         "owner": "hercules-ci",
         "repo": "flake-parts",
-        "rev": "b253292d9c0a5ead9bc98c4e9a26c6312e27d69f",
+        "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2",
         "type": "github"
       },
       "original": {
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1708655239,
-        "narHash": "sha256-ZrP/yACUvDB+zbqYJsln4iwotbH6CTZiTkANJ0AgDv4=",
+        "lastModified": 1709237383,
+        "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "cbc4211f0afffe6dfd2478a62615dd5175a13f9a",
+        "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
         "type": "github"
       },
       "original": {
@@ -37,11 +37,11 @@
     "nixpkgs-lib": {
       "locked": {
         "dir": "lib",
-        "lastModified": 1706550542,
-        "narHash": "sha256-UcsnCG6wx++23yeER4Hg18CXWbgNpqNXcHIo5/1Y+hc=",
+        "lastModified": 1709237383,
+        "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "97b17f32362e475016f942bbdfda4a4a72a8a652",
+        "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8",
         "type": "github"
       },
       "original": {
ggml-cuda.cu

@@ -6904,6 +6904,7 @@ static __global__ void soft_max_f32(const float * x, const float * mask, const f
     // find the sum of exps in the block
     tmp = warp_reduce_sum(tmp);
     if (block_size > WARP_SIZE) {
+        __syncthreads();
         if (warp_id == 0) {
             buf_iw[lane_id] = 0.0f;
         }
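Note: the added barrier sits at the point where the kernel switches from per-warp reductions to the cross-warp reduction in shared memory; presumably it ensures every warp in the block is done with the shared `buf_iw` buffer from the preceding max-reduction pass before warp 0 re-initializes it for the sum, closing a race that can only occur when `block_size > WARP_SIZE`.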
gguf-py/gguf/constants.py

@@ -604,20 +604,28 @@ class PoolingType(IntEnum):
 
 
 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1
     Q4_0 = 2
     Q4_1 = 3
     Q5_0 = 6
     Q5_1 = 7
     Q8_0 = 8
     Q8_1 = 9
     Q2_K = 10
     Q3_K = 11
     Q4_K = 12
     Q5_K = 13
     Q6_K = 14
     Q8_K = 15
+    IQ2_XXS = 16
+    IQ2_XS = 17
+    IQ3_XXS = 18
+    IQ1_S = 19
+    IQ4_NL = 20
+    IQ3_S = 21
+    IQ2_S = 22
+    IQ4_XS = 23
 
 
 class GGUFEndian(IntEnum):
@@ -662,20 +670,28 @@ class GGUFValueType(IntEnum):
 QK_K = 256
 # Items here are (block size, type size)
 GGML_QUANT_SIZES = {
     GGMLQuantizationType.F32: (1, 4),
     GGMLQuantizationType.F16: (1, 2),
     GGMLQuantizationType.Q4_0: (32, 2 + 16),
     GGMLQuantizationType.Q4_1: (32, 2 + 2 + 16),
     GGMLQuantizationType.Q5_0: (32, 2 + 4 + 16),
     GGMLQuantizationType.Q5_1: (32, 2 + 2 + 4 + 16),
     GGMLQuantizationType.Q8_0: (32, 2 + 32),
     GGMLQuantizationType.Q8_1: (32, 4 + 4 + 32),
     GGMLQuantizationType.Q2_K: (256, 2 + 2 + QK_K // 16 + QK_K // 4),
     GGMLQuantizationType.Q3_K: (256, 2 + QK_K // 4 + QK_K // 8 + 12),
     GGMLQuantizationType.Q4_K: (256, 2 + 2 + QK_K // 2 + 12),
     GGMLQuantizationType.Q5_K: (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
     GGMLQuantizationType.Q6_K: (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
     GGMLQuantizationType.Q8_K: (256, 4 + QK_K + QK_K // 8),
+    GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
+    GGMLQuantizationType.IQ2_XS: (256, 2 + QK_K // 4 + QK_K // 32),
+    GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
+    GGMLQuantizationType.IQ1_S: (256, 2 + QK_K // 8 + QK_K // 16),
+    GGMLQuantizationType.IQ4_NL: (32, 2 + 16),
+    GGMLQuantizationType.IQ3_S: (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
+    GGMLQuantizationType.IQ2_S: (256, 2 + QK_K // 4 + QK_K // 16),
+    GGMLQuantizationType.IQ4_XS: (256, 2 + 2 + QK_K // 2 + QK_K // 64),
 }
 
 
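For orientation, the `(block size, type size)` pairs above translate directly into bits per weight: IQ4_NL packs 32 weights into 2 + 16 = 18 bytes (4.5 bpw), and IQ2_XXS packs 256 weights into 2 + 64 = 66 bytes (about 2.06 bpw). A minimal C++ sketch of the same bookkeeping (helper name and the hard-coded IQ4_XS entry are illustrative, not part of the commit):

    #include <cstdint>
    #include <cstdio>

    // Sketch: size of a quantized tensor from a (block_size, type_size) pair,
    // mirroring the GGML_QUANT_SIZES table above (values shown for IQ4_XS).
    static size_t tensor_nbytes(int64_t n_elements, int64_t block_size, int64_t type_size) {
        // n_elements is assumed to be a multiple of block_size
        return (size_t) (n_elements / block_size) * type_size;
    }

    int main() {
        const int64_t QK_K       = 256;
        const int64_t block_size = 256;                           // IQ4_XS blocks hold 256 weights
        const int64_t type_size  = 2 + 2 + QK_K / 2 + QK_K / 64;  // = 136 bytes per block
        const int64_t n          = 4096 * 4096;                   // one 4096x4096 weight matrix
        std::printf("%zu bytes (%.2f bits/weight)\n",
                    tensor_nbytes(n, block_size, type_size),
                    8.0 * type_size / block_size);
        return 0;
    }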
llama.cpp (91 changes)

@@ -873,16 +873,16 @@ struct LLM_TN {
 // gguf helpers
 //
 
-static const std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
+static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
     { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
     { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
 };
 
-static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
+static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
-            return kv.first;
+            return (llama_rope_scaling_type) kv.first;
         }
     }
 
@@ -1612,7 +1612,6 @@ struct llama_hparams {
     float rope_freq_base_train;
     float rope_freq_scale_train;
     uint32_t n_yarn_orig_ctx;
-    int32_t rope_scaling_type_train;
 
     float f_clamp_kqv = 0.0f;
     float f_max_alibi_bias = 0.0f;
@@ -1620,8 +1619,9 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;
 
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
+    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
 
     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -1670,8 +1670,8 @@ struct llama_cparams {
     uint32_t n_threads;       // number of threads to use for generation
     uint32_t n_threads_batch; // number of threads to use for batch processing
 
     float rope_freq_base;
     float rope_freq_scale;
 
     uint32_t n_yarn_orig_ctx;
     // These hyperparameters are not exposed in GGUF, because all
@@ -1683,7 +1683,7 @@ struct llama_cparams {
     float defrag_thold;
 
     bool offload_kqv;
-    bool do_pooling;
+    enum llama_pooling_type pooling_type;
 
     ggml_backend_sched_eval_callback cb_eval;
     void * cb_eval_user_data;
@@ -2156,10 +2156,12 @@ static bool llama_kv_cache_find_slot(
 }
 
 // find how many cells are currently in use
-static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
-    for (uint32_t i = cache.size - 1; i > 0; --i) {
-        if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
-            return i + 1;
+static uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
+    for (uint32_t i = cache.size; i > 0; --i) {
+        const llama_kv_cell & cell = cache.cells[i - 1];
+
+        if (cell.pos >= 0 && !cell.is_empty()) {
+            return i;
         }
     }
 
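Note: besides the signed-to-unsigned return type, the rewritten loop starts at `cache.size` and tests `cells[i - 1]`, so cell 0 is now inspected as well; the old `cache.size - 1 ... i > 0` form never looked at the first cell and would report 0 used cells when only cell 0 was occupied.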
@@ -2931,7 +2933,11 @@ template<>
 bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
     uint32_t tmp;
     const bool found = get_key(kid, tmp, required);
-    result = (enum llama_pooling_type) tmp;
+    if (found) {
+        result = (enum llama_pooling_type) tmp;
+    } else {
+        result = LLAMA_POOLING_TYPE_UNSPECIFIED;
+    }
     return found;
 }
 
@@ -3208,7 +3214,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
 
                 switch (hparams.n_layer) {
                     case 3:
@@ -5173,7 +5179,7 @@ struct llm_build_context {
         n_kv (worst_case ? n_ctx : kv_self.n),
         kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx (cparams.n_yarn_orig_ctx),
-        pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
+        pooling_type (cparams.pooling_type),
         rope_type (hparams.rope_type),
         cb (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
@@ -8013,7 +8019,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
@@ -8041,7 +8047,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
         const int64_t n_tokens = batch.n_tokens;
 
         GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
@@ -8178,7 +8184,7 @@ static int llama_decode_internal(
         // a heuristic, to avoid attending the full cache if it is not yet utilized
         // after enough generations, the benefit from this heuristic disappears
         // if we start defragmenting the cache, the benefit from this will be more important
-        kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+        kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
         //kv_self.n = llama_kv_cache_cell_max(kv_self);
 
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
@@ -11844,6 +11850,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
         /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
+        /*.pooling_type =*/ LLAMA_POOLING_TYPE_UNSPECIFIED,
         /*.rope_freq_base =*/ 0.0f,
         /*.rope_freq_scale =*/ 0.0f,
         /*.yarn_ext_factor =*/ -1.0f,
@@ -11859,7 +11866,6 @@ struct llama_context_params llama_context_default_params() {
         /*.logits_all =*/ false,
         /*.embedding =*/ false,
         /*.offload_kqv =*/ true,
-        /*.do_pooling =*/ true,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -12010,7 +12016,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.defrag_thold   = params.defrag_thold;
     cparams.offload_kqv    = params.offload_kqv;
-    cparams.do_pooling     = params.do_pooling;
+    cparams.pooling_type   = params.pooling_type;
 
     cparams.n_ctx          = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -12036,6 +12042,14 @@ struct llama_context * llama_new_context_with_model(
         cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
 
+    if (cparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+        if (hparams.pooling_type == LLAMA_POOLING_TYPE_UNSPECIFIED) {
+            cparams.pooling_type = LLAMA_POOLING_TYPE_NONE;
+        } else {
+            cparams.pooling_type = hparams.pooling_type;
+        }
+    }
+
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
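Note: the resolution order for pooling is now: an explicit `llama_context_params::pooling_type` from the caller wins; otherwise the value stored in the model's GGUF metadata (`hparams.pooling_type`) is used; and if the model does not specify one either, pooling falls back to `LLAMA_POOLING_TYPE_NONE`. This replaces the old boolean `do_pooling` switch.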
@@ -12615,9 +12629,14 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     const size_t s_logits = ctx->logits.capacity() * sizeof(float);
     const size_t s_embedding_size = sizeof(size_t);
     const size_t s_embedding = ctx->embedding.size() * sizeof(float);
-    const size_t s_kv_size = sizeof(size_t);
-    const size_t s_kv_ntok = sizeof(int);
+    const size_t s_kv_buf_size = sizeof(size_t);
+    const size_t s_kv_head = sizeof(uint32_t);
+    const size_t s_kv_size = sizeof(uint32_t);
+    const size_t s_kv_used = sizeof(uint32_t);
     const size_t s_kv = ctx->kv_self.total_size();
+    // TODO: assume the max is more than 1 seq_id per KV cell
+    const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id);
+    const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell;
 
     const size_t s_total = (
         + s_rng_size
@@ -12626,9 +12645,12 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
         + s_logits
         + s_embedding_size
         + s_embedding
+        + s_kv_buf_size
+        + s_kv_head
         + s_kv_size
-        + s_kv_ntok
+        + s_kv_used
         + s_kv
+        + s_kv_cells
     );
 
     return s_total;
@@ -12728,15 +12750,13 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
-        const auto & cparams = ctx->cparams;
 
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const uint32_t n_ctx = cparams.n_ctx;
 
         const size_t kv_buf_size = kv_self.total_size();
-        const uint32_t kv_head = kv_self.head;
+        const uint32_t kv_head = llama_kv_cache_cell_max(kv_self);
         const uint32_t kv_size = kv_self.size;
         const uint32_t kv_used = kv_self.used;
 
@@ -12756,7 +12776,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
 
            // v is not contiguous, copy row by row
            const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
 
            tmp_buf.resize(v_row_size);
            for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
@@ -12766,7 +12786,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             }
         }
 
-        for (uint32_t i = 0; i < kv_size; ++i) {
+        for (uint32_t i = 0; i < kv_head; ++i) {
             const auto & cell = kv_self.cells[i];
 
             const llama_pos pos = cell.pos;
@@ -12842,12 +12862,10 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
     {
         const auto & kv_self = ctx->kv_self;
         const auto & hparams = ctx->model.hparams;
-        const auto & cparams = ctx->cparams;
 
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
         const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-        const uint32_t n_ctx = cparams.n_ctx;
 
         size_t kv_buf_size;
         uint32_t kv_head;
@@ -12870,7 +12888,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
 
            // v is not contiguous, copy row by row
            const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size);
 
            for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
                ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
@@ -12879,13 +12897,15 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
             }
         }
 
+        GGML_ASSERT(kv_self.size == kv_size);
+
         ctx->kv_self.head = kv_head;
         ctx->kv_self.size = kv_size;
         ctx->kv_self.used = kv_used;
 
         ctx->kv_self.cells.resize(kv_size);
 
-        for (uint32_t i = 0; i < kv_size; ++i) {
+        for (uint32_t i = 0; i < kv_head; ++i) {
             llama_pos pos;
             size_t seq_id_size;
 
@@ -12901,6 +12921,11 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
                 ctx->kv_self.cells[i].seq_id.insert(seq_id);
             }
         }
+
+        for (uint32_t i = kv_head; i < kv_size; ++i) {
+            ctx->kv_self.cells[i].pos = -1;
+            ctx->kv_self.cells[i].seq_id.clear();
+        }
     }
 
     const size_t nread = inp - src;
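Note: the session-state format now records the KV buffer size, head, size and used counters explicitly, copies V rows with a `kv_size`-based stride instead of `cparams.n_ctx`, serializes per-cell position/sequence-id data only for the first `kv_head` (i.e. `llama_kv_cache_cell_max`) cells, and resets any remaining cells on restore.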
llama.h (7 changes)

@@ -129,6 +129,7 @@ extern "C" {
     };
 
     enum llama_pooling_type {
+        LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
         LLAMA_POOLING_TYPE_NONE = 0,
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS = 2,
@@ -236,7 +237,10 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
-        int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+
+        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
+                                                        // (ignored if no pooling layer)
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
@@ -258,7 +262,6 @@ extern "C" {
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embedding;   // embedding mode only
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool do_pooling;  // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer)
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
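For context, a minimal sketch of how an embedding context might be created against the updated header (model path and pooling choice are placeholders; error handling omitted):

    #include "llama.h"

    int main() {
        llama_backend_init();

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("models/7B/ggml-model.gguf", mparams);

        llama_context_params cparams = llama_context_default_params();
        cparams.embedding    = true;                     // embedding mode only
        cparams.pooling_type = LLAMA_POOLING_TYPE_MEAN;  // replaces the removed bool do_pooling
        // rope_scaling_type stays LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED -> use the model default

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... tokenize, llama_decode(), llama_get_embeddings() ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }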