Merge branch 'master' of github.com:ggerganov/llama.cpp into mfalcon_mamba_cuda
commit 40f47872b3
11 changed files with 5695 additions and 6059 deletions
@@ -24,6 +24,8 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 ENV GGML_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 RUN make -j$(nproc) llama-server
 
@@ -26,6 +26,8 @@ RUN apt-get update && \
 COPY --from=build /app/build/bin/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 
@@ -39,6 +39,8 @@ ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
 ENV GGML_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 # Enable cURL
 ENV LLAMA_CURL=1
@@ -23,6 +23,8 @@ RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app
 
 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 
@@ -21,6 +21,8 @@ RUN apt-get update && \
 COPY --from=build /app/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
 
 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
 
.ecrc (2 changes)

@@ -1,5 +1,5 @@
 {
-    "Exclude": ["^\\.gitmodules$"],
+    "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
     "Disable": {
         "IndentSize": true
     }
@@ -327,6 +327,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 void gpt_params_parse_from_env(gpt_params & params) {
     // we only care about server-related params for now
     get_env("LLAMA_ARG_MODEL", params.model);
+    get_env("LLAMA_ARG_MODEL_URL", params.model_url);
+    get_env("LLAMA_ARG_MODEL_ALIAS", params.model_alias);
+    get_env("LLAMA_ARG_HF_REPO", params.hf_repo);
+    get_env("LLAMA_ARG_HF_FILE", params.hf_file);
     get_env("LLAMA_ARG_THREADS", params.n_threads);
     get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
     get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
@@ -341,6 +345,9 @@ void gpt_params_parse_from_env(gpt_params & params) {
     get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
     get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
     get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
+    get_env("LLAMA_ARG_CONT_BATCHING", params.cont_batching);
+    get_env("LLAMA_ARG_HOST", params.hostname);
+    get_env("LLAMA_ARG_PORT", params.port);
 }
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
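Note on the `get_env` calls above: the same helper name is passed members of different types (`params.model` is a string, `params.n_threads` an integer, `params.embedding` a boolean, `params.defrag_thold` a float), so the helper has to resolve on the destination type. The sketch below is a minimal, hypothetical illustration of that pattern, not the code from this commit; the conversion rules and accepted boolean spellings are assumptions.

```cpp
// Hypothetical sketch of type-dispatched env parsing (not the actual common.cpp code).
#include <cstdlib>
#include <iostream>
#include <string>

static void get_env(const char * name, std::string & target) {
    if (const char * v = std::getenv(name)) { target = v; }  // unset -> keep current value
}

static void get_env(const char * name, int & target) {
    if (const char * v = std::getenv(name)) { target = std::stoi(v); }
}

static void get_env(const char * name, float & target) {
    if (const char * v = std::getenv(name)) { target = std::stof(v); }
}

static void get_env(const char * name, bool & target) {
    if (const char * v = std::getenv(name)) {
        const std::string s(v);
        target = (s == "1" || s == "true");  // assumed truthy spellings
    }
}

int main() {
    // Stand-ins for the gpt_params fields used above.
    std::string hostname = "127.0.0.1";
    int         port     = 8080;
    bool        metrics  = false;

    // A set variable overrides the value already in the field, matching the
    // README note that these variables override command-line arguments.
    get_env("LLAMA_ARG_HOST", hostname);
    get_env("LLAMA_ARG_PORT", port);
    get_env("LLAMA_ARG_ENDPOINT_METRICS", metrics);

    std::cout << hostname << ":" << port << " metrics=" << metrics << "\n";
    return 0;
}
```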
common/stb_image.h (11662 changes)

File diff suppressed because it is too large
@@ -1572,7 +1572,7 @@ class LlamaModel(Model):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
@@ -3820,7 +3820,7 @@ class ExaoneModel(Model):
         if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
             if rope_scaling.get("rope_type", '').lower() == "llama3":
                 base = self.hparams.get("rope_theta", 10000.0)
-                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"])
                 freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
 
                 factor = rope_scaling.get("factor", 8.0)
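Both hunks above change the RoPE dimension used for the llama3 rope-scaling correction from `hidden_size // num_attention_heads` to an explicit `head_dim` when the checkpoint provides one, falling back to the old formula otherwise; the resulting `freqs` vector has `dim / 2` entries because `torch.arange(0, dim, 2)` steps by two. The snippet below only illustrates why the fallback matters, using made-up hyperparameters (no particular model):

```cpp
// Illustrative only: a hypothetical config whose explicit head_dim is not
// hidden_size / num_attention_heads.
#include <cstdio>

int main() {
    const int hidden_size         = 2048;
    const int num_attention_heads = 32;
    const int head_dim            = 128;  // explicit value in the (hypothetical) config

    const int derived = hidden_size / num_attention_heads;  // old formula -> 64
    std::printf("derived dim = %d, explicit head_dim = %d\n", derived, head_dim);
    std::printf("rope freq count: old %d vs new %d\n", derived / 2, head_dim / 2);
    return 0;
}
```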
@@ -249,23 +249,49 @@ logging:
 
 Available environment variables (if specified, these variables will override parameters specified in arguments):
 
-- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
-- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
-- `LLAMA_ARG_MODEL`
-- `LLAMA_ARG_THREADS`
-- `LLAMA_ARG_CTX_SIZE`
-- `LLAMA_ARG_N_PARALLEL`
-- `LLAMA_ARG_BATCH`
-- `LLAMA_ARG_UBATCH`
-- `LLAMA_ARG_N_GPU_LAYERS`
-- `LLAMA_ARG_THREADS_HTTP`
-- `LLAMA_ARG_CHAT_TEMPLATE`
-- `LLAMA_ARG_N_PREDICT`
-- `LLAMA_ARG_ENDPOINT_METRICS`
-- `LLAMA_ARG_ENDPOINT_SLOTS`
-- `LLAMA_ARG_EMBEDDINGS`
-- `LLAMA_ARG_FLASH_ATTN`
-- `LLAMA_ARG_DEFRAG_THOLD`
+- `LLAMA_CACHE`: cache directory, used by `--hf-repo`
+- `HF_TOKEN`: Hugging Face access token, used when accessing a gated model with `--hf-repo`
+- `LLAMA_ARG_MODEL`: equivalent to `-m`
+- `LLAMA_ARG_MODEL_URL`: equivalent to `-mu`
+- `LLAMA_ARG_MODEL_ALIAS`: equivalent to `-a`
+- `LLAMA_ARG_HF_REPO`: equivalent to `--hf-repo`
+- `LLAMA_ARG_HF_FILE`: equivalent to `--hf-file`
+- `LLAMA_ARG_THREADS`: equivalent to `-t`
+- `LLAMA_ARG_CTX_SIZE`: equivalent to `-c`
+- `LLAMA_ARG_N_PARALLEL`: equivalent to `-np`
+- `LLAMA_ARG_BATCH`: equivalent to `-b`
+- `LLAMA_ARG_UBATCH`: equivalent to `-ub`
+- `LLAMA_ARG_N_GPU_LAYERS`: equivalent to `-ngl`
+- `LLAMA_ARG_THREADS_HTTP`: equivalent to `--threads-http`
+- `LLAMA_ARG_CHAT_TEMPLATE`: equivalent to `--chat-template`
+- `LLAMA_ARG_N_PREDICT`: equivalent to `-n`
+- `LLAMA_ARG_ENDPOINT_METRICS`: if set to `1`, it will enable metrics endpoint (equivalent to `--metrics`)
+- `LLAMA_ARG_ENDPOINT_SLOTS`: if set to `0`, it will **disable** slots endpoint (equivalent to `--no-slots`). This feature is enabled by default.
+- `LLAMA_ARG_EMBEDDINGS`: if set to `1`, it will enable embeddings endpoint (equivalent to `--embeddings`)
+- `LLAMA_ARG_FLASH_ATTN`: if set to `1`, it will enable flash attention (equivalent to `-fa`)
+- `LLAMA_ARG_CONT_BATCHING`: if set to `0`, it will **disable** continuous batching (equivalent to `--no-cont-batching`). This feature is enabled by default.
+- `LLAMA_ARG_DEFRAG_THOLD`: equivalent to `-dt`
+- `LLAMA_ARG_HOST`: equivalent to `--host`
+- `LLAMA_ARG_PORT`: equivalent to `--port`
+
+Example usage of docker compose with environment variables:
+
+```yml
+services:
+  llamacpp-server:
+    image: ghcr.io/ggerganov/llama.cpp:server
+    ports:
+      - 8080:8080
+    volumes:
+      - ./models:/models
+    environment:
+      # alternatively, you can use "LLAMA_ARG_MODEL_URL" to download the model
+      LLAMA_ARG_MODEL: /models/my_model.gguf
+      LLAMA_ARG_CTX_SIZE: 4096
+      LLAMA_ARG_N_PARALLEL: 2
+      LLAMA_ARG_ENDPOINT_METRICS: 1 # to disable, either remove or set to 0
+      LLAMA_ARG_PORT: 8080
+```
 
 ## Build
 
@@ -6605,6 +6605,7 @@ static bool llm_load_tensors(
     const int64_t n_embd_gqa = n_embd_v_gqa;
     const int64_t n_vocab = hparams.n_vocab;
     const int64_t n_vocab_type = hparams.n_vocab_type;
+    const int64_t n_rot = hparams.n_rot;
     const int64_t n_expert = hparams.n_expert;
     const int64_t n_expert_used = hparams.n_expert_used;
     const int64_t n_ctx_train = hparams.n_ctx_train;
@@ -6662,7 +6663,7 @@ static bool llm_load_tensors(
 
        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
-       layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+       layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
 
        if (n_expert == 0) {
            layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
@@ -8115,8 +8116,8 @@ static bool llm_load_tensors(
 
        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-       layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + (hparams.n_embd_head_k << 2)});
-       layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + (hparams.n_embd_head_k << 2)});
+       layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+       layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
 
        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
@@ -8193,7 +8194,7 @@ static bool llm_load_tensors(
        layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd});
 
        layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
-       layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_embd/n_head/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
+       layer.rope_freqs = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ROPE_FREQS, "weight"), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0));
        layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
        layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
        layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
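The three hunks above replace hard-coded shape expressions with hyperparameter-driven ones: the fused QKV projection width becomes `n_embd + 2*n_embd_gqa` (Q rows plus K and V rows under grouped-query attention), and the `rope_freqs` tensor becomes `{n_rot/2}` rather than `{n_embd/n_head/2}`, which only coincide when the rotary dimension equals the head size derived from `n_embd/n_head`. The sketch below works through the arithmetic with hypothetical values (assuming equal K and V head sizes; the numbers are not taken from any particular architecture):

```cpp
// Illustrative shape arithmetic with made-up hyperparameters.
#include <cstdio>

int main() {
    const int n_embd        = 2048;
    const int n_head        = 32;
    const int n_head_kv     = 8;    // grouped-query attention: fewer K/V heads than Q heads
    const int n_embd_head_k = 128;  // per-head K size (assumed equal to the V head size)
    const int n_rot         = 128;  // rotary dimension; may differ from n_embd/n_head

    const int n_embd_gqa = n_head_kv * n_embd_head_k;  // total K (or V) rows across KV heads

    // Fused QKV output width: n_embd rows for Q plus K and V rows.
    std::printf("qkv width, old: %d\n", n_embd + (n_embd_head_k << 2));  // 2048 + 512  = 2560
    std::printf("qkv width, new: %d\n", n_embd + 2 * n_embd_gqa);        // 2048 + 2048 = 4096

    // rope_freqs length: half of the rotated dimension.
    std::printf("rope_freqs, old: %d\n", n_embd / n_head / 2);  // 32
    std::printf("rope_freqs, new: %d\n", n_rot / 2);            // 64
    return 0;
}
```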