server: update refs -> llama-server
gitignore llama-server
Parent: 849842916d
Commit: f298cc63d2
16 changed files with 35 additions and 35 deletions
@@ -25,13 +25,13 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1
 
-RUN make -j$(nproc) server
+RUN make -j$(nproc) llama-server
 
 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1
 
-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server
 
-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]
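The runtime stage above keeps its ENTRYPOINT pointing at the renamed binary, so container arguments are forwarded to `/llama-server` unchanged. A minimal sketch of building and running the image; the Dockerfile path and image tag below are assumptions for illustration, not taken from this diff:

```bash
# Build the CUDA server image from the repository root
# (Dockerfile path and tag are assumptions, not part of this commit).
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .

# Run it; everything after the image name is passed to /llama-server by the ENTRYPOINT.
docker run --gpus all -p 8080:8080 -v /path/to/models:/models \
    local/llama.cpp:server-cuda \
    -m /models/7B/ggml-model.gguf -c 2048 --host 0.0.0.0 --port 8080
```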
@@ -38,8 +38,8 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev
 
-COPY --from=build /app/build/bin/server /server
+COPY --from=build /app/build/bin/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev
 
-RUN make -j$(nproc)
+RUN make -j$(nproc) llama-server
 
-ENTRYPOINT [ "/app/server" ]
+ENTRYPOINT [ "/app/llama-server" ]
@@ -23,9 +23,9 @@ RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
 
 # Clean up
 WORKDIR /
-RUN cp /app/build/bin/server /server && \
+RUN cp /app/build/bin/llama-server /llama-server && \
     rm -rf /app
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]
@@ -11,15 +11,15 @@ COPY . .
 
 ENV LLAMA_CURL=1
 
-RUN make -j$(nproc) server
+RUN make -j$(nproc) llama-server
 
 FROM ubuntu:$UBUNTU_VERSION as runtime
 
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1
 
-COPY --from=build /app/server /server
+COPY --from=build /app/llama-server /llama-server
 
 ENV LC_ALL=C.utf8
 
-ENTRYPOINT [ "/server" ]
+ENTRYPOINT [ "/llama-server" ]
@@ -26,7 +26,7 @@ elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$@"
+    ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
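The `--server`/`-s` branch of this entrypoint script now dispatches to `./llama-server` inside the container. A usage sketch, assuming the all-in-one image is tagged `local/llama.cpp:full` (the tag and model path are placeholders):

```bash
# '--server' makes the entrypoint script run ./llama-server with the remaining arguments.
docker run -p 8080:8080 -v /path/to/models:/models local/llama.cpp:full \
    --server -m /models/7B/ggml-model.gguf -c 2048 --host 0.0.0.0 --port 8080
```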
.gitignore (vendored)
@@ -76,7 +76,7 @@ models-mnt
 /quantize-stats
 /result
 /save-load-state
-/server
+/llama-server
 /simple
 /batched
 /batched-bench
@@ -1,5 +1,5 @@
 # Usage:
-#! ./server -m some-model.gguf &
+#! ./llama-server -m some-model.gguf &
 #! pip install pydantic
 #! python json-schema-pydantic-example.py
 
@@ -16,7 +16,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 4096 --batch-size 1024}"
 
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
-./server $GEN_OPTIONS \
+./llama-server $GEN_OPTIONS \
     --model "$MODEL" \
     --threads "$N_THREAD" \
     --rope-freq-scale 1.0 \
@@ -80,26 +80,26 @@ The project is under active development, and we are [looking for feedback and co
 
 ## Build
 
-`server` is built alongside everything else from the root of the project
+`llama-server` is built alongside everything else from the root of the project
 
 - Using `make`:
 
   ```bash
-  make server
+  make llama-server
   ```
 
 - Using `CMake`:
 
   ```bash
   cmake -B build
-  cmake --build build --config Release -t server
+  cmake --build build --config Release -t llama-server
   ```
 
-Binary is at `./build/bin/server`
+Binary is at `./build/bin/llama-server`
 
 ## Build with SSL
 
-`server` can also be built with SSL support using OpenSSL 3
+`llama-server` can also be built with SSL support using OpenSSL 3
 
 - Using `make`:
 
@@ -107,14 +107,14 @@ The project is under active development, and we are [looking for feedback and co
   # NOTE: For non-system openssl, use the following:
   # CXXFLAGS="-I /path/to/openssl/include"
   # LDFLAGS="-L /path/to/openssl/lib"
-  make LLAMA_SERVER_SSL=true server
+  make LLAMA_SERVER_SSL=true llama-server
   ```
 
 - Using `CMake`:
 
   ```bash
   cmake -B build -DLLAMA_SERVER_SSL=ON
-  cmake --build build --config Release -t server
+  cmake --build build --config Release -t llama-server
   ```
 
 ## Quick Start
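Because only the build target and binary names change in this commit, existing wrapper scripts keep working once they point at the new name. A small rename-tolerant launcher sketch; both paths are shown purely for illustration:

```bash
# Prefer the renamed binary, fall back to the old name for builds that predate this commit.
BIN=./build/bin/llama-server
[ -x "$BIN" ] || BIN=./build/bin/server
"$BIN" -m models/7B/ggml-model.gguf -c 2048
```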
@@ -124,13 +124,13 @@ To get started right away, run the following command, making sure to use the cor
 ### Unix-based systems (Linux, macOS, etc.)
 
 ```bash
-./server -m models/7B/ggml-model.gguf -c 2048
+./llama-server -m models/7B/ggml-model.gguf -c 2048
 ```
 
 ### Windows
 
 ```powershell
-server.exe -m models\7B\ggml-model.gguf -c 2048
+llama-server.exe -m models\7B\ggml-model.gguf -c 2048
 ```
 
 The above command will start a server that by default listens on `127.0.0.1:8080`.
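As a quick smoke test once `llama-server` is listening on `127.0.0.1:8080`, the `/completion` endpoint can be queried directly; the prompt below is only an example:

```bash
# Send a plain completion request to the running llama-server instance.
curl --request POST \
    --url http://127.0.0.1:8080/completion \
    --header "Content-Type: application/json" \
    --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 128}'
```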
@@ -629,11 +629,11 @@ bash chat.sh
 
 ### OAI-like API
 
-The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
+The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
 
 ### API errors
 
-`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
+`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
 
 Example of an error:
 
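For reference, a chat completion request against the OAI-compatible endpoint might look like the following; the model name and prompt are placeholders:

```bash
# OpenAI-style chat completion against the local llama-server instance.
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer no-key" \
    -d '{
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Write a limerick about python exceptions."}
        ]
    }'
```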
@@ -99,7 +99,7 @@ The `bench.py` script does several steps:
 It aims to be used in the CI, but you can run it manually:
 
 ```shell
-LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
+LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
     --runner-label local \
     --name local \
     --branch `git rev-parse --abbrev-ref HEAD` \
@@ -245,7 +245,7 @@ def start_server(args):
 
 def start_server_background(args):
     # Start the server
-    server_path = '../../../build/bin/server'
+    server_path = '../../../build/bin/llama-server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         server_path = os.environ['LLAMA_SERVER_BIN_PATH']
     server_args = [
@@ -44,12 +44,12 @@ http module.
 
 ### running using examples/server
 
-bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
+./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]
 
 ### running using python3's server module
 
 first run examples/server
-* bin/server -m path/model.gguf
+* ./llama-server -m path/model.gguf
 
 next run this web front end in examples/server/public_simplechat
 * cd ../examples/server/public_simplechat
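The step after that `cd` is serving the static front end; one way to do it with Python's built-in module (the port number is an assumption, and the path is given relative to the repository root):

```bash
# Serve the simplechat front end locally, then open http://127.0.0.1:8081 in a browser.
cd examples/server/public_simplechat
python3 -m http.server 8081
```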
@@ -40,7 +40,7 @@ It's possible to override some scenario steps values with environment variables:
 | variable                 | description                                                                                     |
 |--------------------------|-------------------------------------------------------------------------------------------------|
 | `PORT`                   | `context.server_port` to set the listening port of the server during scenario, default: `8080`  |
-| `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/server`                          |
+| `LLAMA_SERVER_BIN_PATH`  | to change the server binary path, default: `../../../build/bin/llama-server`                    |
 | `DEBUG`                  | "ON" to enable steps and server verbose mode `--verbose`                                        |
 | `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format                                                        |
 | `N_GPU_LAYERS`           | number of model layers to offload to VRAM `-ngl --n-gpu-layers`                                 |
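A sketch of how these overrides combine when running the server test scenarios; the test directory and the `behave` invocation are assumptions based on the surrounding test setup, not part of this diff:

```bash
# Run the scenarios against a custom binary, on a different port, with verbose server logs.
cd examples/server/tests
LLAMA_SERVER_BIN_PATH=../../../build/bin/llama-server PORT=8081 DEBUG=ON \
    behave --summary --stop --tags llama.cpp
```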
@@ -1272,9 +1272,9 @@ def context_text(context):
 
 def start_server_background(context):
     if os.name == 'nt':
-        context.server_path = '../../../build/bin/Release/server.exe'
+        context.server_path = '../../../build/bin/Release/llama-server.exe'
     else:
-        context.server_path = '../../../build/bin/server'
+        context.server_path = '../../../build/bin/llama-server'
     if 'LLAMA_SERVER_BIN_PATH' in os.environ:
         context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
     server_listen_addr = context.server_fqdn
@@ -1,6 +1,6 @@
 # GBNF Guide
 
-GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/server`.
+GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.org/wiki/Formal_grammar) to constrain model outputs in `llama.cpp`. For example, you can use it to force the model to generate valid JSON, or speak only in emojis. GBNF grammars are supported in various ways in `examples/main` and `examples/llama-server`.
 
 ## Background
 
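To illustrate the server-side grammar support mentioned above, a GBNF grammar can be passed inline with a completion request; a minimal sketch that constrains the answer to yes/no (the `grammar` request field is used here as commonly exposed by the server, shown as an assumption rather than taken from this diff):

```bash
# Constrain generation with an inline GBNF grammar: the model may only answer "yes" or "no".
curl http://127.0.0.1:8080/completion \
    -H "Content-Type: application/json" \
    -d '{"prompt": "Is the sky blue? Answer:", "n_predict": 4, "grammar": "root ::= \"yes\" | \"no\""}'
```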