Merge branch 'master' into xsn/full_image_less

Commit 2e15e0fa21

29 changed files with 1364 additions and 321 deletions

@@ -1,18 +1,16 @@
 ARG UBUNTU_VERSION=22.04

 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0

 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
+    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

 COPY requirements.txt requirements.txt
 COPY requirements requirements

@@ -24,14 +22,16 @@ WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-
-# Only build targets used by tools.sh
-RUN make -j$(nproc) llama-quantize llama-cli llama-server
+# Use the default CUDA archs if not specified
+# Then, only build targets used by tools.sh
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-quantize llama-cli llama-server -j$(nproc) && \
+    mv build/bin/* . && \
+    mv build/src/libllama.so . && \
+    mv build/ggml/src/libggml.so . && \
+    rm -rf build

 ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image

@@ -8,28 +8,30 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git cmake

 WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-
-RUN make -j$(nproc) llama-cli
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-cli -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
     apt-get install -y libgomp1

-COPY --from=build /app/llama-cli /llama-cli
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli

 ENTRYPOINT [ "/llama-cli" ]

@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG CUDA_VERSION=11.7.1
+ARG CUDA_VERSION=12.6.0
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the CUDA runtime image

@@ -8,33 +8,34 @@ ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_CUDA_DEV_CONTAINER} AS build

-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
+# CUDA architecture to build for (defaults to all supported archs)
+ARG CUDA_DOCKER_ARCH=default

 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git cmake libcurl4-openssl-dev

 WORKDIR /app

 COPY . .

-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable CUDA
-ENV GGML_CUDA=1
-# Enable cURL
-ENV LLAMA_CURL=1
-# Must be set to 0.0.0.0 so it can listen to requests from host machine
-ENV LLAMA_ARG_HOST=0.0.0.0
-
-RUN make -j$(nproc) llama-server
+# Use the default CUDA archs if not specified
+RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
+        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
+    fi && \
+    cmake -B build -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+    cmake --build build --config Release --target llama-server -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev libgomp1 curl

-COPY --from=build /app/llama-server /llama-server
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0

 HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

.github/workflows/docker.yml (vendored): 15

@@ -96,21 +96,12 @@ jobs:
         env:
           GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

-      - name: Build and push Docker image (versioned)
+      - name: Build and push Docker image (tagged + versioned)
         if: github.event_name == 'push'
-        uses: docker/build-push-action@v4
+        uses: docker/build-push-action@v6
         with:
           context: .
           push: true
           platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          file: ${{ matrix.config.dockerfile }}
-
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+          tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
           file: ${{ matrix.config.dockerfile }}

@@ -251,6 +251,57 @@ int32_t cpu_get_num_math() {
     return cpu_get_num_physical_cores();
 }

+// Helper for setting process priority
+
+#if defined(_WIN32)
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true;
+    }
+
+    DWORD p = NORMAL_PRIORITY_CLASS;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p = NORMAL_PRIORITY_CLASS;       break;
+        case GGML_SCHED_PRIO_MEDIUM:   p = ABOVE_NORMAL_PRIORITY_CLASS; break;
+        case GGML_SCHED_PRIO_HIGH:     p = HIGH_PRIORITY_CLASS;         break;
+        case GGML_SCHED_PRIO_REALTIME: p = REALTIME_PRIORITY_CLASS;     break;
+    }
+
+    if (!SetPriorityClass(GetCurrentProcess(), p)) {
+        fprintf(stderr, "warn: failed to set process priority class %d : (%d)\n", prio, (int) GetLastError());
+        return false;
+    }
+
+    return true;
+}
+
+#else // MacOS and POSIX
+#include <sys/types.h>
+#include <sys/resource.h>
+
+bool set_process_priority(enum ggml_sched_priority prio) {
+    if (prio == GGML_SCHED_PRIO_NORMAL) {
+        return true;
+    }
+
+    int p = 0;
+    switch (prio) {
+        case GGML_SCHED_PRIO_NORMAL:   p =   0; break;
+        case GGML_SCHED_PRIO_MEDIUM:   p =  -5; break;
+        case GGML_SCHED_PRIO_HIGH:     p = -10; break;
+        case GGML_SCHED_PRIO_REALTIME: p = -20; break;
+    }
+
+    if (!setpriority(PRIO_PROCESS, 0, p)) {
+        fprintf(stderr, "warn: failed to set process priority %d : %s (%d)\n", prio, strerror(errno), errno);
+        return false;
+    }
+    return true;
+}
+
+#endif
+
 //
 // CLI argument parsing
 //

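Since set_process_priority() is the entry point tools use to request a scheduling class, here is a minimal usage sketch; the surrounding main() and the chosen priority level are illustrative, not part of this commit.

```cpp
// Sketch only: raise the current process priority before starting generation.
// set_process_priority() and ggml_sched_priority come from the changes above;
// this main() is hypothetical.
#include <cstdio>
#include "common.h"

int main() {
    // GGML_SCHED_PRIO_HIGH maps to HIGH_PRIORITY_CLASS on Windows and nice -10 on POSIX.
    if (!set_process_priority(GGML_SCHED_PRIO_HIGH)) {
        fprintf(stderr, "continuing with normal priority\n");
    }
    return 0;
}
```
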
@@ -277,6 +328,30 @@ void gpt_params_handle_model_default(gpt_params & params) {
     }
 }

+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model) {
+    int32_t n_set = 0;
+
+    if (cpuparams.n_threads < 0) {
+        // Assuming everything about cpuparams is invalid
+        if (role_model != nullptr) {
+            cpuparams = *role_model;
+        } else {
+            cpuparams.n_threads = cpu_get_num_math();
+        }
+    }
+
+    for (int32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
+        if (cpuparams.cpumask[i]) {
+            n_set++;
+        }
+    }
+
+    if (n_set && n_set < cpuparams.n_threads) {
+        // Not enough set bits, may experience performance issues.
+        fprintf(stderr, "warn: Not enough set bits in CPU mask (%d) to satisfy requested thread count: %d\n", n_set, cpuparams.n_threads);
+    }
+}
+
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;

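A small sketch of how the defaulting in postprocess_cpu_params() resolves in practice; the values are made up, while cpu_params and the helper are the ones introduced in this merge.

```cpp
// Sketch: the batch parameters inherit everything from the generation
// parameters unless the user set a batch thread count explicitly.
#include "common.h"

void example() {
    cpu_params gen;              // n_threads == -1, no mask set
    cpu_params batch;            // also untouched

    gen.n_threads = 8;           // as if the user passed -t 8

    postprocess_cpu_params(gen);          // keeps 8 (only -1 values are filled in)
    postprocess_cpu_params(batch, &gen);  // copies gen, so batch.n_threads == 8
}
```
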
@@ -296,6 +371,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         }
     }

+    postprocess_cpu_params(params.cpuparams, nullptr);
+    postprocess_cpu_params(params.cpuparams_batch, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams, &params.cpuparams);
+    postprocess_cpu_params(params.draft_cpuparams_batch, &params.cpuparams_batch);
+
     if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
         throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
     }

@@ -331,7 +411,7 @@ void gpt_params_parse_from_env(gpt_params & params) {
     get_env("LLAMA_ARG_MODEL_ALIAS",   params.model_alias);
     get_env("LLAMA_ARG_HF_REPO",       params.hf_repo);
     get_env("LLAMA_ARG_HF_FILE",       params.hf_file);
-    get_env("LLAMA_ARG_THREADS",       params.n_threads);
+    get_env("LLAMA_ARG_THREADS",       params.cpuparams.n_threads);
     get_env("LLAMA_ARG_CTX_SIZE",      params.n_ctx);
     get_env("LLAMA_ARG_N_PARALLEL",    params.n_parallel);
     get_env("LLAMA_ARG_BATCH",         params.n_batch);

@@ -368,6 +448,79 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return true;
 }

+bool parse_cpu_range(const std::string & range, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    size_t dash_loc = range.find('-');
+    if (dash_loc == std::string::npos) {
+        fprintf(stderr, "Format of CPU range is invalid! Expected [<start>]-[<end>].\n");
+        return false;
+    }
+
+    size_t start_i;
+    size_t end_i;
+
+    if (dash_loc == 0) {
+        start_i = 0;
+    } else {
+        start_i = std::stoull(range.substr(0, dash_loc));
+        if (start_i >= GGML_MAX_N_THREADS) {
+            fprintf(stderr, "Start index out of bounds!\n");
+            return false;
+        }
+    }
+
+    if (dash_loc == range.length() - 1) {
+        end_i = GGML_MAX_N_THREADS - 1;
+    } else {
+        end_i = std::stoull(range.substr(dash_loc + 1));
+        if (end_i >= GGML_MAX_N_THREADS) {
+            fprintf(stderr, "End index out of bounds!\n");
+            return false;
+        }
+    }
+
+    for (size_t i = start_i; i <= end_i; i++) {
+        boolmask[i] = true;
+    }
+
+    return true;
+}
+
+bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREADS]) {
+    // Discard potential 0x prefix
+    size_t start_i = 0;
+    if (mask.length() >= 2 && mask.substr(0, 2) == "0x") {
+        start_i = 2;
+    }
+
+    size_t num_digits = mask.length() - start_i;
+    if (num_digits > 128) num_digits = 128;
+
+    size_t end_i = num_digits + start_i;
+
+    for (size_t i = start_i, n = (num_digits*4 - 1); i < end_i; i++, n-=4) {
+        char c = mask.at(i);
+        int8_t id = c;
+
+        if ((c >= '0' && c <= '9')) {
+            id -= '0';
+        } else if (c >= 'a' && c <= 'f') {
+            id -= 'a' - 10;
+        } else if (c >= 'A' && c <= 'F') {
+            id -= 'A' - 10;
+        } else {
+            fprintf(stderr, "Invalid hex character '%c' at position %d\n", c, int32_t(i));
+            return false;
+        }
+
+        boolmask[  n  ] = boolmask[  n  ] || ((id & 8) != 0);
+        boolmask[n - 1] = boolmask[n - 1] || ((id & 4) != 0);
+        boolmask[n - 2] = boolmask[n - 2] || ((id & 2) != 0);
+        boolmask[n - 3] = boolmask[n - 3] || ((id & 1) != 0);
+    }
+
+    return true;
+}
+
 #define CHECK_ARG if (++i >= argc) { invalid_param = true; return true; }

 bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {

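For orientation, a hedged sketch of what the two parsers above produce for typical inputs; the inputs are illustrative only.

```cpp
// Sketch: "0-3" marks CPUs 0..3; "0xf0" marks CPUs 4..7 (each hex digit covers
// four CPUs, with the leftmost digit mapping to the highest indices).
// Both functions are the ones added in the hunk above.
#include <cstdio>
#include "common.h"

void example() {
    bool mask[GGML_MAX_N_THREADS] = { false };

    parse_cpu_range("0-3", mask);   // sets mask[0..3]
    parse_cpu_mask("0xf0", mask);   // additionally sets mask[4..7]

    for (int i = 0; i < 8; i++) {
        printf("cpu %d: %d\n", i, mask[i] ? 1 : 0);
    }
}
```
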
@@ -384,36 +537,142 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
     }
     if (arg == "-t" || arg == "--threads") {
         CHECK_ARG
-        params.n_threads = std::stoi(argv[i]);
-        if (params.n_threads <= 0) {
-            params.n_threads = std::thread::hardware_concurrency();
+        params.cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams.n_threads <= 0) {
+            params.cpuparams.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-C" || arg == "--cpu-mask") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Cr" || arg == "--cpu-range") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio") {
+        CHECK_ARG
+        params.cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict") {
+        CHECK_ARG
+        params.cpuparams.strict_cpu = std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--poll") {
+        CHECK_ARG
+        params.cpuparams.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-tb" || arg == "--threads-batch") {
         CHECK_ARG
-        params.n_threads_batch = std::stoi(argv[i]);
-        if (params.n_threads_batch <= 0) {
-            params.n_threads_batch = std::thread::hardware_concurrency();
+        params.cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.cpuparams_batch.n_threads <= 0) {
+            params.cpuparams_batch.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Cb" || arg == "--cpu-mask-batch") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "-Crb" || arg == "--cpu-range_batch") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch") {
+        CHECK_ARG
+        params.cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch") {
+        params.cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch") {
+        CHECK_ARG
+        params.cpuparams_batch.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-td" || arg == "--threads-draft") {
         CHECK_ARG
-        params.n_threads_draft = std::stoi(argv[i]);
-        if (params.n_threads_draft <= 0) {
-            params.n_threads_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams.n_threads <= 0) {
+            params.draft_cpuparams.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Cd" || arg == "--cpu-mask-draft") {
+        CHECK_ARG
+        std::string mask = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_mask(mask, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "-Crd" || arg == "--cpu-range-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams.cpumask);
+        return true;
+    }
+    if (arg == "--prio-draft") {
+        CHECK_ARG
+        params.draft_cpuparams.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-draft") {
+        params.draft_cpuparams.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-draft") {
+        CHECK_ARG
+        params.draft_cpuparams.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-tbd" || arg == "--threads-batch-draft") {
         CHECK_ARG
-        params.n_threads_batch_draft = std::stoi(argv[i]);
-        if (params.n_threads_batch_draft <= 0) {
-            params.n_threads_batch_draft = std::thread::hardware_concurrency();
+        params.draft_cpuparams_batch.n_threads = std::stoi(argv[i]);
+        if (params.draft_cpuparams_batch.n_threads <= 0) {
+            params.draft_cpuparams_batch.n_threads = std::thread::hardware_concurrency();
         }
         return true;
     }
+    if (arg == "-Crbd" || arg == "--cpu-range-batch-draft") {
+        CHECK_ARG
+        std::string range = argv[i];
+        params.draft_cpuparams_batch.mask_valid = true;
+        invalid_param = !parse_cpu_range(range, params.draft_cpuparams_batch.cpumask);
+        return true;
+    }
+    if (arg == "--prio-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.priority = (enum ggml_sched_priority) std::stoul(argv[i]);
+        return true;
+    }
+    if (arg == "--cpu-strict-batch-draft") {
+        params.draft_cpuparams_batch.strict_cpu = true;
+        return true;
+    }
+    if (arg == "--poll-batch-draft") {
+        CHECK_ARG
+        params.draft_cpuparams_batch.poll = std::stoul(argv[i]);
+        return true;
+    }
     if (arg == "-p" || arg == "--prompt") {
         CHECK_ARG
         params.prompt = argv[i];

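To show where the new flags end up, here is a rough sketch that drives gpt_params_find_arg() directly; in the real code gpt_params_parse() runs this loop, and the argv values here are made up.

```cpp
// Sketch: feed "-C 0xff --prio 2" through the parser added above and check
// where the values land. The priority mapping (2 == high) follows the help
// text introduced later in this merge.
#include <cassert>
#include "common.h"

void example() {
    gpt_params params;
    bool invalid_param = false;

    const char * argv[] = { "prog", "-C", "0xff", "--prio", "2" };
    int argc = 5;

    for (int i = 1; i < argc; i++) {
        gpt_params_find_arg(argc, (char **) argv, argv[i], params, i, invalid_param);
    }

    assert(params.cpuparams.mask_valid);                       // -C set the affinity mask
    assert(params.cpuparams.priority == GGML_SCHED_PRIO_HIGH); // --prio 2 == high
}
```
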
@@ -1498,11 +1757,40 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     options.push_back({ "*",           "       --no-display-prompt",    "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
     options.push_back({ "*",           "-co,   --color",                "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
     options.push_back({ "*",           "-s,    --seed SEED",            "RNG seed (default: %d, use random seed for < 0)", params.seed });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.n_threads });
+    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during generation (default: %d)", params.cpuparams.n_threads });
     options.push_back({ "*",           "-tb,   --threads-batch N",      "number of threads to use during batch and prompt processing (default: same as --threads)" });
     options.push_back({ "speculative", "-td,   --threads-draft N",      "number of threads to use during generation (default: same as --threads)" });
-    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N",
-                                       "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+    options.push_back({ "speculative", "-tbd,  --threads-batch-draft N","number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+
+#ifndef GGML_USE_OPENMP
+    // these options are available only with the internal threadpool
+    options.push_back({ "*", "-C, --cpu-mask M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range (default: \"\")"});
+    options.push_back({ "*", "-Cr, --cpu-range lo-hi", "range of CPUs for affinity. Complements --cpu-mask"});
+    options.push_back({ "*", " --cpu-strict <0|1>", "use strict CPU placement (default: %u)\n", (unsigned) params.cpuparams.strict_cpu});
+    options.push_back({ "*", " --priority N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority});
+    options.push_back({ "*", " --poll <0...100>", "use polling level to wait for work (0 - no polling, default: %u)\n", (unsigned) params.cpuparams.poll});
+
+    options.push_back({ "*", "-Cb, --cpu-mask-batch M", "CPU affinity mask: arbitrarily long hex. Complements cpu-range-batch (default: same as --cpu-mask)"});
+    options.push_back({ "*", "-Crb, --cpu-range-batch lo-hi", "ranges of CPUs for affinity. Complements --cpu-mask-batch"});
+    options.push_back({ "*", " --cpu-strict-batch <0|1>","use strict CPU placement (default: same as --cpu-strict)"});
+    options.push_back({ "*", " --priority-batch N", "set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority)"});
+    options.push_back({ "*", " --poll-batch <0|1>", "use polling to wait for work (default: same as --poll"});
+
+    options.push_back({ "speculative", "-Cd, --cpu-mask-draft M", "Draft model CPU affinity mask. Complements cpu-range-draft (default: same as --cpu-mask)"});
+    options.push_back({ "speculative", "-Crd, --cpu-range-draft lo-hi", "Ranges of CPUs for affinity. Complements --cpu-mask-draft"});
+    options.push_back({ "speculative", " --cpu-strict-draft <0|1>","Use strict CPU placement for draft model (default: same as --cpu-strict)"});
+    options.push_back({ "speculative", " --priority-draft N", "Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: same as --priority)"});
+    options.push_back({ "speculative", " --poll-draft <0|1>", "Use polling to wait for draft model work (default: same as --poll])"});
+
+    options.push_back({ "speculative", "-Cbd, --cpu-mask-batch-draft M","Draft model CPU affinity mask. Complements cpu-range-draft-batch (default: same as --cpu-mask-draft)"});
+    options.push_back({ "speculative", "-Crbd, --cpu-range-batch-draft lo-hi",
+                        "Ranges of CPUs for affinity. Complements --cpu-mask-draft-batch)"});
+    options.push_back({ "speculative", " --cpu-strict-batch-draft <0|1>",
+                        "Use strict CPU placement for draft model (default: --cpu-strict-draft)"});
+    options.push_back({ "speculative", " --priority-batch-draft N","Set draft process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: --priority-draft)"});
+    options.push_back({ "speculative", " --poll-batch-draft <0|1>","Use polling to wait for draft model work (default: --poll-draft)"});
+#endif // GGML_USE_OPENMP
+
     options.push_back({ "speculative", "       --draft N",              "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
     options.push_back({ "speculative", "-ps,   --p-split N",            "speculative decoding split probability (default: %.1f)", (double)params.p_split });
     options.push_back({ "*",           "-lcs,  --lookup-cache-static FNAME",

@@ -1774,7 +2062,6 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     options.push_back({ "export-lora", "-m,    --model",                "model path from which to load base model (default '%s')", params.model.c_str() });
     options.push_back({ "export-lora", "       --lora FNAME",           "path to LoRA adapter (can be repeated to use multiple adapters)" });
     options.push_back({ "export-lora", "       --lora-scaled FNAME S",  "path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters)" });
-    options.push_back({ "*",           "-t,    --threads N",            "number of threads to use during computation (default: %d)", params.n_threads });
     options.push_back({ "export-lora", "-o,    --output FNAME",         "output file (default: '%s')", params.lora_outfile.c_str() });

     printf("usage: %s [options]\n", argv[0]);

@@ -1806,9 +2093,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 std::string gpt_params_get_system_info(const gpt_params & params) {
     std::ostringstream os;

-    os << "system_info: n_threads = " << params.n_threads;
-    if (params.n_threads_batch != -1) {
-        os << " (n_threads_batch = " << params.n_threads_batch << ")";
+    os << "system_info: n_threads = " << params.cpuparams.n_threads;
+    if (params.cpuparams_batch.n_threads != -1) {
+        os << " (n_threads_batch = " << params.cpuparams_batch.n_threads << ")";
     }
 #if defined(_WIN32) && (_WIN32_WINNT >= 0x0601) && !defined(__MINGW64__) // windows 7 and later
     // TODO: windows + arm64 + mingw64

@@ -2332,8 +2619,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     cparams.n_seq_max        = params.n_parallel;
     cparams.n_batch          = params.n_batch;
     cparams.n_ubatch         = params.n_ubatch;
-    cparams.n_threads        = params.n_threads;
-    cparams.n_threads_batch  = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+    cparams.n_threads        = params.cpuparams.n_threads;
+    cparams.n_threads_batch  = params.cpuparams_batch.n_threads == -1 ?
+                                   params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
     cparams.seed             = params.seed;
     cparams.logits_all       = params.logits_all;
     cparams.embeddings       = params.embedding;

@@ -2359,6 +2647,22 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
     return cparams;
 }

+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params) {
+    struct ggml_threadpool_params tpp;
+
+    ggml_threadpool_params_init(&tpp, params.n_threads); // setup the defaults
+
+    if (params.mask_valid) {
+        std::memcpy(&tpp.cpumask, &params.cpumask, GGML_MAX_N_THREADS);
+    }
+
+    tpp.prio       = params.priority;
+    tpp.poll       = params.poll;
+    tpp.strict_cpu = params.strict_cpu;
+
+    return tpp;
+}
+
 #ifdef LLAMA_USE_CURL

 static bool starts_with(const std::string & str, const std::string & prefix) {

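A short sketch of how the new conversion helper can be used; what is then done with the resulting ggml_threadpool_params (for example handing it to the ggml threadpool API) is outside this hunk and not shown.

```cpp
// Sketch: turn the CLI-level cpu_params into ggml threadpool parameters.
// cpu_params and ggml_threadpool_params_from_cpu_params() are from this merge.
#include "common.h"

void example(const gpt_params & params) {
    struct ggml_threadpool_params tpp = ggml_threadpool_params_from_cpu_params(params.cpuparams);

    // tpp now carries the thread count, the optional affinity mask, the priority,
    // the polling level and the strict-placement flag chosen on the command line.
    (void) tpp;
}
```
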
@@ -3348,7 +3652,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);

     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
-    fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+    fprintf(stream, "threads: %d # default: %u\n", params.cpuparams.n_threads, std::thread::hardware_concurrency());
     fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
     fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
     fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);

@@ -67,13 +67,18 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };

+struct cpu_params {
+    int      n_threads = -1;
+    bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
+    bool     mask_valid = false;                    // Default: any CPU
+    enum ggml_sched_priority priority = GGML_SCHED_PRIO_NORMAL; // Scheduling prio : (0 - normal, 1 - medium, 2 - high, 3 - realtime)
+    bool     strict_cpu = false;                    // Use strict CPU placement
+    uint32_t poll = 50;                             // Polling (busywait) level (0 - no polling, 100 - mostly polling)
+};
+
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed

-    int32_t n_threads             = cpu_get_num_math();
-    int32_t n_threads_draft       = -1;
-    int32_t n_threads_batch       = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
     int32_t n_predict             = -1; // new tokens to predict
     int32_t n_ctx                 = 0;  // context size
     int32_t n_batch               = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)

@@ -100,6 +105,11 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0;    // YaRN original context length
     float   defrag_thold  = -1.0f; // KV cache defragmentation threshold

+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+    struct cpu_params draft_cpuparams;
+    struct cpu_params draft_cpuparams_batch;
+
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;

@@ -204,7 +214,7 @@ struct gpt_params {
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1;           // number of threads to process HTTP requests
+    int     n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)

     std::string hostname      = "127.0.0.1";
     std::string public_path   = "";

@@ -277,6 +287,11 @@ void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);

 std::string gpt_params_get_system_info(const gpt_params & params);

+bool parse_cpu_range(const std::string& range, bool(&boolmask)[GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool set_process_priority(enum ggml_sched_priority prio);
+
 //
 // String utils
 //

@@ -327,8 +342,9 @@ struct llama_init_result {

 struct llama_init_result llama_init_from_gpt_params(gpt_params & params);

 struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

@@ -66,8 +66,8 @@ You may want to pass in some different `ARGS`, depending on the CUDA environment

 The defaults are:

-- `CUDA_VERSION` set to `11.7.1`
-- `CUDA_DOCKER_ARCH` set to `all`
+- `CUDA_VERSION` set to `12.6.0`
+- `CUDA_DOCKER_ARCH` set to the cmake build default, which includes all the supported architectures

 The resulting images, are essentially the same as the non-CUDA images:

@@ -18,7 +18,7 @@ constexpr float rms_norm_eps = 5e-6f;
 #endif

 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

     if (plan.work_size > 0) {
         buf.resize(plan.work_size);

@@ -21,7 +21,7 @@
 #endif

 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

     if (plan.work_size > 0) {
         buf.resize(plan.work_size);

@@ -54,7 +54,7 @@ static void tensor_dump(const ggml_tensor * tensor, const char * name) {
 #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)

 struct benchmark_params_struct {
-    int32_t n_threads    = 1;
+    int     n_threads    = 1;
     int32_t n_iterations = 10;
 };

@ -486,8 +486,8 @@ int main(int argc, char ** argv) {
|
||||||
if (use_pca) {
|
if (use_pca) {
|
||||||
// run PCA
|
// run PCA
|
||||||
PCA::pca_params pca_params;
|
PCA::pca_params pca_params;
|
||||||
pca_params.n_threads = params.n_threads;
|
pca_params.n_threads = params.cpuparams.n_threads;
|
||||||
pca_params.n_batch = params.n_pca_batch;
|
pca_params.n_batch = params.n_pca_batch;
|
||||||
pca_params.n_iterations = params.n_pca_iterations;
|
pca_params.n_iterations = params.n_pca_iterations;
|
||||||
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -410,7 +410,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
g_verbose = (params.verbosity == 1);
|
g_verbose = (params.verbosity == 1);
|
||||||
try {
|
try {
|
||||||
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.n_threads);
|
lora_merge_ctx ctx(params.model, params.lora_adapters, params.lora_outfile, params.cpuparams.n_threads);
|
||||||
ctx.run_merge();
|
ctx.run_merge();
|
||||||
} catch (const std::exception & err) {
|
} catch (const std::exception & err) {
|
||||||
fprintf(stderr, "%s\n", err.what());
|
fprintf(stderr, "%s\n", err.what());
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
@ -225,6 +226,9 @@ struct cmd_params {
|
||||||
std::vector<ggml_type> type_k;
|
std::vector<ggml_type> type_k;
|
||||||
std::vector<ggml_type> type_v;
|
std::vector<ggml_type> type_v;
|
||||||
std::vector<int> n_threads;
|
std::vector<int> n_threads;
|
||||||
|
std::vector<std::string> cpu_mask;
|
||||||
|
std::vector<bool> cpu_strict;
|
||||||
|
std::vector<int> poll;
|
||||||
std::vector<int> n_gpu_layers;
|
std::vector<int> n_gpu_layers;
|
||||||
std::vector<std::string> rpc_servers;
|
std::vector<std::string> rpc_servers;
|
||||||
std::vector<llama_split_mode> split_mode;
|
std::vector<llama_split_mode> split_mode;
|
||||||
|
@ -236,6 +240,8 @@ struct cmd_params {
|
||||||
std::vector<bool> embeddings;
|
std::vector<bool> embeddings;
|
||||||
ggml_numa_strategy numa;
|
ggml_numa_strategy numa;
|
||||||
int reps;
|
int reps;
|
||||||
|
ggml_sched_priority prio;
|
||||||
|
int delay;
|
||||||
bool verbose;
|
bool verbose;
|
||||||
output_formats output_format;
|
output_formats output_format;
|
||||||
output_formats output_format_stderr;
|
output_formats output_format_stderr;
|
||||||
|
@ -251,6 +257,9 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* type_k */ {GGML_TYPE_F16},
|
/* type_k */ {GGML_TYPE_F16},
|
||||||
/* type_v */ {GGML_TYPE_F16},
|
/* type_v */ {GGML_TYPE_F16},
|
||||||
/* n_threads */ {cpu_get_num_math()},
|
/* n_threads */ {cpu_get_num_math()},
|
||||||
|
/* cpu_mask */ {"0x0"},
|
||||||
|
/* cpu_strict */ {false},
|
||||||
|
/* poll */ {50},
|
||||||
/* n_gpu_layers */ {99},
|
/* n_gpu_layers */ {99},
|
||||||
/* rpc_servers */ {""},
|
/* rpc_servers */ {""},
|
||||||
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
|
||||||
|
@ -262,6 +271,8 @@ static const cmd_params cmd_params_defaults = {
|
||||||
/* embeddings */ {false},
|
/* embeddings */ {false},
|
||||||
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
|
||||||
/* reps */ 5,
|
/* reps */ 5,
|
||||||
|
/* prio */ GGML_SCHED_PRIO_NORMAL,
|
||||||
|
/* delay */ 0,
|
||||||
/* verbose */ false,
|
/* verbose */ false,
|
||||||
/* output_format */ MARKDOWN,
|
/* output_format */ MARKDOWN,
|
||||||
/* output_format_stderr */ NONE,
|
/* output_format_stderr */ NONE,
|
||||||
|
@ -281,6 +292,9 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||||
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
printf(" -ctk, --cache-type-k <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
|
||||||
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
|
||||||
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
|
||||||
|
printf(" -C, --cpu-mask <hex,hex> (default: %s)\n", join(cmd_params_defaults.cpu_mask, ",").c_str());
|
||||||
|
printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
|
||||||
|
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
|
||||||
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
|
||||||
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
|
||||||
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
|
||||||
|
@ -292,6 +306,8 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||||
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
|
||||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||||
|
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
||||||
|
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
||||||
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
|
||||||
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
|
||||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||||
|
@ -338,6 +354,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
|
||||||
params.reps = cmd_params_defaults.reps;
|
params.reps = cmd_params_defaults.reps;
|
||||||
params.numa = cmd_params_defaults.numa;
|
params.numa = cmd_params_defaults.numa;
|
||||||
|
params.prio = cmd_params_defaults.prio;
|
||||||
|
params.delay = cmd_params_defaults.delay;
|
||||||
|
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
arg = argv[i];
|
arg = argv[i];
|
||||||
|
@ -433,6 +451,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
auto p = string_split<int>(argv[i], split_delim);
|
auto p = string_split<int>(argv[i], split_delim);
|
||||||
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
||||||
|
} else if (arg == "-C" || arg == "--cpu-mask") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = string_split<std::string>(argv[i], split_delim);
|
||||||
|
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
|
||||||
|
} else if (arg == "--cpu-strict") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = string_split<bool>(argv[i], split_delim);
|
||||||
|
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
|
||||||
|
} else if (arg == "--poll") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
auto p = string_split<int>(argv[i], split_delim);
|
||||||
|
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
||||||
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -541,6 +580,18 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.reps = std::stoi(argv[i]);
|
params.reps = std::stoi(argv[i]);
|
||||||
|
} else if (arg == "--prio") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
|
||||||
|
} else if (arg == "--delay") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.delay = std::stoi(argv[i]);
|
||||||
} else if (arg == "-o" || arg == "--output") {
|
} else if (arg == "-o" || arg == "--output") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
|
@ -585,6 +636,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||||
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
|
||||||
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
|
||||||
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; }
|
||||||
|
if (params.cpu_mask.empty()) { params.cpu_mask = cmd_params_defaults.cpu_mask; }
|
||||||
|
if (params.cpu_strict.empty()) { params.cpu_strict = cmd_params_defaults.cpu_strict; }
|
||||||
|
if (params.poll.empty()) { params.poll = cmd_params_defaults.poll; }
|
||||||
|
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
@@ -598,6 +652,9 @@ struct cmd_params_instance {
     ggml_type type_k;
     ggml_type type_v;
     int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
     int n_gpu_layers;
     std::string rpc_servers;
     llama_split_mode split_mode;
@@ -667,7 +724,10 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & tv : params.type_v)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & fa : params.flash_attn)
-    for (const auto & nt : params.n_threads) {
+    for (const auto & nt : params.n_threads)
+    for (const auto & cm : params.cpu_mask)
+    for (const auto & cs : params.cpu_strict)
+    for (const auto & pl : params.poll) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
                 continue;
@@ -681,6 +741,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -707,6 +770,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -733,6 +799,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .type_k = */ tk,
                 /* .type_v = */ tv,
                 /* .n_threads = */ nt,
+                /* .cpu_mask = */ cm,
+                /* .cpu_strict = */ cs,
+                /* .poll = */ pl,
                 /* .n_gpu_layers = */ nl,
                 /* .rpc_servers = */ rpc,
                 /* .split_mode = */ sm,
@@ -769,6 +838,9 @@ struct test {
     int n_batch;
     int n_ubatch;
     int n_threads;
+    std::string cpu_mask;
+    bool cpu_strict;
+    int poll;
     bool has_rpc;
     ggml_type type_k;
     ggml_type type_v;
@@ -795,6 +867,9 @@ struct test {
        n_batch = inst.n_batch;
        n_ubatch = inst.n_ubatch;
        n_threads = inst.n_threads;
+       cpu_mask = inst.cpu_mask;
+       cpu_strict = inst.cpu_strict;
+       poll = inst.poll;
        has_rpc = !inst.rpc_servers.empty();
        type_k = inst.type_k;
        type_v = inst.type_v;
@@ -872,13 +947,14 @@ struct test {
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",
-            "n_threads", "type_k", "type_v",
+            "n_threads", "cpu_mask", "cpu_strict", "poll",
+            "type_k", "type_v",
             "n_gpu_layers", "split_mode",
             "main_gpu", "no_kv_offload", "flash_attn",
             "tensor_split", "use_mmap", "embeddings",
             "n_prompt", "n_gen", "test_time",
             "avg_ns", "stddev_ns",
-            "avg_ts", "stddev_ts"
+            "avg_ts", "stddev_ts",
         };
         return fields;
     }
@@ -887,7 +963,7 @@ struct test {

     static field_type get_field_type(const std::string & field) {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" ||
-            field == "n_threads" ||
+            field == "n_threads" || field == "poll" ||
             field == "model_size" || field == "model_n_params" ||
             field == "n_gpu_layers" || field == "main_gpu" ||
             field == "n_prompt" || field == "n_gen" ||
@@ -896,6 +972,7 @@ struct test {
         }
         if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+            field == "cpu_strict" ||
             field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
@@ -928,7 +1005,8 @@ struct test {
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),
-            std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
+            std::to_string(n_threads), cpu_mask, std::to_string(cpu_strict), std::to_string(poll),
+            ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
             std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
             tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
@@ -1067,7 +1145,7 @@ struct markdown_printer : public printer {
             return -30;
         }
         if (field == "t/s") {
-            return 16;
+            return 20;
         }
         if (field == "size" || field == "params") {
             return 10;
@@ -1149,6 +1227,15 @@ struct markdown_printer : public printer {
         if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
             fields.emplace_back("n_threads");
         }
+        if (params.cpu_mask.size() > 1 || params.cpu_mask != cmd_params_defaults.cpu_mask) {
+            fields.emplace_back("cpu_mask");
+        }
+        if (params.cpu_strict.size() > 1 || params.cpu_strict != cmd_params_defaults.cpu_strict) {
+            fields.emplace_back("cpu_strict");
+        }
+        if (params.poll.size() > 1 || params.poll != cmd_params_defaults.poll) {
+            fields.emplace_back("poll");
+        }
         if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
             fields.emplace_back("n_batch");
         }
@@ -1383,6 +1470,8 @@ int main(int argc, char ** argv) {
     llama_backend_init();
     llama_numa_init(params.numa);

+    set_process_priority(params.prio);
+
     // initialize printer
     std::unique_ptr<printer> p = create_printer(params.output_format);
     std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
@@ -1428,6 +1517,28 @@ int main(int argc, char ** argv) {

         llama_kv_cache_clear(ctx);

+        // cool off before the test
+        if (params.delay) {
+            std::this_thread::sleep_for(std::chrono::seconds(params.delay));
+        }
+
+        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
+        if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {
+            LOG_TEE("%s: failed to parse cpu-mask: %s\n", __func__, t.cpu_mask.c_str());
+            exit(1);
+        }
+        tpp.strict_cpu = t.cpu_strict;
+        tpp.poll = t.poll;
+        tpp.prio = params.prio;
+
+        struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
+        if (!threadpool) {
+            LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+            exit(1);
+        }
+
+        llama_attach_threadpool(ctx, threadpool, NULL);
+
         // warmup run
         if (t.n_prompt > 0) {
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
@@ -1466,6 +1577,8 @@ int main(int argc, char ** argv) {
         llama_print_timings(ctx);

         llama_free(ctx);
+
+        ggml_threadpool_free(threadpool);
     }

     llama_free_model(lmodel);

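Note: taken together, the llama-bench changes above turn the new cpu_mask / cpu_strict / poll test parameters into a per-test ggml threadpool. A minimal sketch of that flow, condensed from the hunks above (t and params are the bench objects shown there, and parse_cpu_mask is the common helper the diff already calls, not something defined here):

    // sketch: build and attach a threadpool for one benchmark configuration
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(t.n_threads);
    if (!parse_cpu_mask(t.cpu_mask, tpp.cpumask)) {   // parse the mask string into the per-core bool array
        /* the bench logs the bad mask and exits here */
    }
    tpp.strict_cpu = t.cpu_strict;                    // pin workers to the masked cores
    tpp.poll       = t.poll;                          // 0 = no polling, 100 = aggressive polling
    tpp.prio       = params.prio;                     // scheduling priority

    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
    llama_attach_threadpool(ctx, threadpool, NULL);   // NULL: the same pool also serves batch work
    /* ... run the timed test ... */
    ggml_threadpool_free(threadpool);
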
@@ -71,8 +71,8 @@ actor LlamaContext {
         var ctx_params = llama_context_default_params()
         ctx_params.seed = 1234
         ctx_params.n_ctx = 2048
-        ctx_params.n_threads = UInt32(n_threads)
-        ctx_params.n_threads_batch = UInt32(n_threads)
+        ctx_params.n_threads = Int32(n_threads)
+        ctx_params.n_threads_batch = Int32(n_threads)

         let context = llama_new_context_with_model(model, ctx_params)
         guard let context else {

@@ -1623,7 +1623,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
     }
 }

-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
     return std::max(lower, std::min(x, upper));
 }

@@ -1827,10 +1827,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
     return refine_size;
 }

-inline int clip(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
-
 static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
     std::vector<int> candidate_split_grids_nums;
     for (int i : {multiple - 1, multiple, multiple + 1}) {

@@ -129,14 +129,14 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
         if (!params->image.empty()) {
             LOG_TEE("using base64 encoded image instead of command line image path\n");
         }
-        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt);
+        embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt);
         if (!embed) {
             LOG_TEE("%s: can't load image from prompt\n", __func__);
             return NULL;
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str());
         if (!embed) {
             fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
@@ -180,7 +180,7 @@ static const char * sample(struct llama_sampling_context * ctx_sampling,

 static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
     auto ctx_clip = clip_init_context(params);
-    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
+    auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
     if (!embeds) {
         std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
         return NULL;

@@ -221,6 +221,40 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    LOG("%s: llama threadpool init = n_threads = %d\n",
+        __func__,
+        (int) params.cpuparams.n_threads
+    );
+    struct ggml_threadpool_params tpp_batch =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams_batch);
+    struct ggml_threadpool_params tpp =
+            ggml_threadpool_params_from_cpu_params(params.cpuparams);
+
+    set_process_priority(params.cpuparams.priority);
+
+    struct ggml_threadpool * threadpool_batch = NULL;
+    if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) {
+        threadpool_batch = ggml_threadpool_new(&tpp_batch);
+        if (!threadpool_batch) {
+            LOG_TEE("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads);
+            exit(1);
+        }
+
+        // Start the non-batch threadpool in the paused state
+        tpp.paused = true;
+    }
+
+    struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp);
+    if (!threadpool) {
+        LOG_TEE("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads);
+        exit(1);
+    }
+
+    llama_attach_threadpool(ctx, threadpool, threadpool_batch);
+    if (ctx_guidance) {
+        llama_attach_threadpool(ctx_guidance, threadpool, threadpool_batch);
+    }
+
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);
     LOG("n_ctx: %d\n", n_ctx);
@@ -989,6 +1023,9 @@ int main(int argc, char ** argv) {
     llama_sampling_free(ctx_sampling);
     llama_backend_free();

+    ggml_threadpool_free(threadpool);
+    ggml_threadpool_free(threadpool_batch);
+
 #ifndef LOG_DISABLE_LOGS
     LOG_TEE("Log end\n");
 #endif // LOG_DISABLE_LOGS

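Note: the two-pool arrangement above is a design choice rather than a requirement: when the batch and generation CPU parameters match, a single pool serves both roles, and the dedicated generation pool is created paused so its workers do not spin while the prompt is being processed. The pause/resume entry points declared further below in ggml.h can also be driven by an embedder; a rough sketch, assuming the pool is attached but currently idle and that ggml resumes an attached paused pool when it is actually used:

    ggml_threadpool_pause(threadpool);    // park the workers while the application is idle
    /* ... unrelated application work ... */
    ggml_threadpool_resume(threadpool);   // wake them up before the next llama_decode() call
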
@@ -2534,8 +2534,8 @@ int main(int argc, char ** argv) {
     });

     LOG_INFO("system info", {
-            {"n_threads", params.n_threads},
-            {"n_threads_batch", params.n_threads_batch},
+            {"n_threads", params.cpuparams.n_threads},
+            {"n_threads_batch", params.cpuparams_batch.n_threads},
             {"total_threads", std::thread::hardware_concurrency()},
             {"system_info", llama_print_system_info()},
     });
@@ -2572,7 +2572,7 @@ int main(int argc, char ** argv) {

     auto res_error = [](httplib::Response & res, json error_data) {
         json final_response {{"error", error_data}};
-        res.set_content(final_response.dump(), MIMETYPE_JSON);
+        res.set_content(final_response.dump(-1, ' ', false, json::error_handler_t::replace), MIMETYPE_JSON);
         res.status = json_value(error_data, "code", 500);
     };

@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
     // load the draft model
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.n_threads_draft > 0) {
-        params.n_threads = params.n_threads_draft;
+    if (params.draft_cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
     }
-    params.n_threads_batch = params.n_threads_batch_draft;
+
+    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
     llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
     model_dft = llama_init_dft.model;
     ctx_dft = llama_init_dft.context;

flake.lock: 6 lines changed (generated file)

@@ -20,11 +20,11 @@
       },
       "nixpkgs": {
         "locked": {
-          "lastModified": 1723637854,
-          "narHash": "sha256-med8+5DSWa2UnOqtdICndjDAEjxr5D7zaIiK4pn0Q7c=",
+          "lastModified": 1724224976,
+          "narHash": "sha256-Z/ELQhrSd7bMzTO8r7NZgi9g5emh+aRKoCdaAv5fiO0=",
           "owner": "NixOS",
           "repo": "nixpkgs",
-          "rev": "c3aa7b8938b17aebd2deecf7be0636000d62a2b9",
+          "rev": "c374d94f1536013ca8e92341b540eba4c22f9c62",
           "type": "github"
         },
         "original": {

@@ -7,8 +7,8 @@ extern "C" {
 #endif

 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
 typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;

 // Tensor allocator
 struct ggml_tallocr {

@@ -103,6 +103,7 @@ extern "C" {

     GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
     GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
     GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

     // Create a backend buffer from an existing pointer

@@ -231,6 +231,8 @@
 #define GGML_MAX_SRC 10
 #ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME 64
+#define GGML_MAX_N_THREADS 512
+
 #endif
 #define GGML_MAX_OP_PARAMS 64
 #define GGML_DEFAULT_N_THREADS 4
@@ -628,6 +630,29 @@ extern "C" {
     // If it returns true, the computation is aborted
     typedef bool (*ggml_abort_callback)(void * data);

+    // Scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // Threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int n_threads; // number of threads
+        enum ggml_sched_priority prio; // thread priority
+        uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
+        bool strict_cpu; // strict cpu placement
+        bool paused; // start in paused state
+    };
+
+    struct ggml_threadpool; // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -635,6 +660,7 @@ extern "C" {
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

         int n_threads;
+        struct ggml_threadpool * threadpool;

         // abort ggml_graph_compute when true
         ggml_abort_callback abort_callback;
@@ -2057,10 +2083,23 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
+    GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
+    GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
+    GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
+    GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
+    GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
+
     // ggml_graph_plan() has to be called before ggml_graph_compute()
     // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan(
+                  const struct ggml_cgraph * cgraph,
+                  int n_threads, /* = GGML_DEFAULT_N_THREADS */
+                  struct ggml_threadpool * threadpool /* = NULL */ );
+    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

     // same as ggml_graph_compute() but the work data is allocated as a part of the context
     // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
     GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);

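Note: the hunks above define the whole threadpool surface of ggml.h: the params struct, the opaque ggml_threadpool handle, and the extra threadpool argument on ggml_graph_plan(). Below is a small sketch of the intended call sequence; the graph-building calls (ggml_init, ggml_new_tensor_1d, ggml_add, ggml_new_graph, ggml_build_forward_expand, ggml_free) are assumed from the existing ggml API and are not part of this diff:

    #include "ggml.h"
    #include <stdlib.h>

    static void compute_on_threadpool(void) {
        struct ggml_init_params ip = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ false };
        struct ggml_context * ctx = ggml_init(ip);

        // tiny graph: a + b (tensor data left unset; this only exercises the scheduling path)
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, ggml_add(ctx, a, b));

        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(4); // 4 threads, default mask/prio/poll
        struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);

        struct ggml_cplan cplan = ggml_graph_plan(gf, ggml_threadpool_get_n_threads(tp), tp);
        cplan.work_data = cplan.work_size > 0 ? malloc(cplan.work_size) : NULL; // caller owns work_data
        ggml_graph_compute(gf, &cplan); // worker threads come from tp instead of an ad-hoc pool

        free(cplan.work_data);
        ggml_threadpool_free(tp);
        ggml_free(ctx);
    }
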
@@ -1247,7 +1247,7 @@ endif()

 # Data types, macros and functions related to controlling CPU affinity and
 # some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
     add_compile_definitions(_GNU_SOURCE)
 endif()

@@ -722,9 +722,11 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 #endif

 struct ggml_backend_cpu_context {
     int n_threads;
-    void * work_data;
-    size_t work_size;
+    ggml_threadpool_t threadpool;
+
+    void * work_data;
+    size_t work_size;

     ggml_abort_callback abort_callback;
     void * abort_callback_data;
@@ -759,7 +761,7 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg

     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

-    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy

     if (cpu_plan->cplan.work_size > 0) {
@@ -796,7 +798,7 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backe
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);

     if (cpu_ctx->work_size < cplan.work_size) {
         free(cpu_ctx->work_data);
@@ -873,6 +875,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     }

     ctx->n_threads = GGML_DEFAULT_N_THREADS;
+    ctx->threadpool = NULL;
     ctx->work_data = NULL;
     ctx->work_size = 0;
     ctx->abort_callback = NULL;
@@ -903,6 +906,18 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }

+void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
+    GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+    struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+
+    if (ctx->threadpool && ctx->threadpool != threadpool) {
+        // already had a different threadpool, pause/suspend it before switching
+        ggml_threadpool_pause(ctx->threadpool);
+    }
+    ctx->threadpool = threadpool;
+}
+
 void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
     GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));

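Note: for code that drives the CPU backend directly instead of going through llama.cpp, the new setter above slots in next to the existing n_threads setter. A hedged fragment; ggml_backend_graph_compute and ggml_backend_free are assumed from the existing ggml-backend API rather than from this diff:

    ggml_backend_t backend = ggml_backend_cpu_init();

    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
    struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);

    ggml_backend_cpu_set_n_threads(backend, 8);     // how many workers each plan will ask for
    ggml_backend_cpu_set_threadpool(backend, tp);   // plans built by this backend now reuse tp

    /* ... ggml_backend_graph_compute(backend, gf); ... */

    ggml_backend_cpu_set_threadpool(backend, NULL); // detach (pauses the old pool) before freeing it
    ggml_threadpool_free(tp);
    ggml_backend_free(backend);
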
ggml/src/ggml.c: 850 lines changed (file diff suppressed because it is too large)

@@ -304,8 +304,8 @@ extern "C" {
         uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
         uint32_t n_ubatch; // physical maximum batch size
         uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
-        uint32_t n_threads; // number of threads to use for generation
-        uint32_t n_threads_batch; // number of threads to use for batch processing
+        int32_t n_threads; // number of threads to use for generation
+        int32_t n_threads_batch; // number of threads to use for batch processing

         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
@@ -428,6 +428,13 @@ extern "C" {
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);

+    // Optional: an auto threadpool gets created in ggml if not passed explicitly
+    LLAMA_API void llama_attach_threadpool(
+            struct llama_context * ctx,
+            ggml_threadpool_t threadpool,
+            ggml_threadpool_t threadpool_batch);
+    LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);

@@ -837,13 +844,13 @@ extern "C" {
     // Set the number of threads used for decoding
     // n_threads is the number of threads used for generation (single token)
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
-    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);

     // Get the number of threads used for generation of a single token.
-    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+    LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);

     // Get the number of threads used for prompt and batch processing (multiple token).
-    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+    LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);

     // Set whether the model is in embeddings mode or not
     // If true, embeddings will be returned but logits will not

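Note: at the llama.h level the threadpools stay optional; per the comment in the hunk above, ggml creates one automatically when nothing is attached. A minimal embedder-side sketch, assuming ctx is an already-created llama_context and llama_decode comes from the existing API:

    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(llama_n_threads(ctx));
    struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);

    llama_attach_threadpool(ctx, tp, NULL);  // NULL batch pool: the same pool is reused for batch processing

    /* ... llama_decode(ctx, batch); ... */

    llama_detach_threadpool(ctx);
    ggml_threadpool_free(tp);
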
@@ -2373,8 +2373,8 @@ struct llama_cparams {
     uint32_t n_batch;
     uint32_t n_ubatch;
     uint32_t n_seq_max;
-    uint32_t n_threads; // number of threads to use for generation
-    uint32_t n_threads_batch; // number of threads to use for batch processing
+    int n_threads; // number of threads to use for generation
+    int n_threads_batch; // number of threads to use for batch processing

     float rope_freq_base;
     float rope_freq_scale;
@@ -3091,6 +3091,9 @@ struct llama_context {
 #endif
     ggml_backend_t backend_cpu = nullptr;

+    ggml_threadpool_t threadpool = nullptr;
+    ggml_threadpool_t threadpool_batch = nullptr;
+
     bool has_evaluated_once = false;

     int64_t t_start_us;
@@ -15494,9 +15497,10 @@ static void llama_output_reorder(struct llama_context * ctx) {
 }

 static void llama_graph_compute(
         llama_context & lctx,
         ggml_cgraph * gf,
-        int n_threads) {
+        int n_threads,
+        ggml_threadpool * threadpool) {
 #ifdef GGML_USE_METAL
     if (ggml_backend_is_metal(lctx.backend_metal)) {
         ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
@@ -15505,6 +15509,7 @@ static void llama_graph_compute(

     if (lctx.backend_cpu != nullptr) {
         ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+        ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool);
         ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data);
     }
 #ifdef GGML_USE_BLAS
@@ -15625,6 +15630,8 @@ static int llama_decode_internal(
     }

     int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
     GGML_ASSERT(n_threads > 0);

     // non-causal masks do not use the KV cache
@@ -15686,7 +15693,7 @@ static int llama_decode_internal(

         llama_set_inputs(lctx, ubatch);

-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);

         // update the kv ring buffer
         {
@@ -15863,7 +15870,9 @@ static int llama_encode_internal(
     lctx.inp_embd_enc = NULL;
     lctx.n_outputs = n_tokens;

-    const int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    ggml_threadpool_t threadpool = n_tokens == 1 ? lctx.threadpool : lctx.threadpool_batch;
+
     GGML_ASSERT(n_threads > 0);

     ggml_backend_sched_reset(lctx.sched);
@@ -15895,7 +15904,7 @@ static int llama_encode_internal(

         llama_set_inputs(lctx, ubatch);

-        llama_graph_compute(lctx, gf, n_threads);
+        llama_graph_compute(lctx, gf, n_threads, threadpool);

         // extract embeddings
         if (embd) {
@@ -16177,7 +16186,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {

     ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);

-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);
 #endif

     //const int64_t t_end = ggml_time_us();
@@ -16203,7 +16212,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {

     llama_set_k_shift(lctx);

-    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads, lctx.threadpool);

     need_reserve = true;
 }
@@ -17451,6 +17460,19 @@ void llama_numa_init(enum ggml_numa_strategy numa) {
     }
 }

+void llama_attach_threadpool(
+        struct llama_context * ctx,
+        ggml_threadpool_t threadpool,
+        ggml_threadpool_t threadpool_batch) {
+    ctx->threadpool = threadpool;
+    ctx->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
+}
+
+void llama_detach_threadpool(struct llama_context * ctx) {
+    ctx->threadpool = nullptr;
+    ctx->threadpool_batch = nullptr;
+}
+
 void llama_backend_free(void) {
     ggml_quantize_free();
 }
@@ -19367,16 +19389,16 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     }
 }

-void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
+void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) {
     ctx->cparams.n_threads = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;
 }

-uint32_t llama_n_threads(struct llama_context * ctx) {
+int32_t llama_n_threads(struct llama_context * ctx) {
     return ctx->cparams.n_threads;
 }

-uint32_t llama_n_threads_batch(struct llama_context * ctx) {
+int32_t llama_n_threads_batch(struct llama_context * ctx) {
     return ctx->cparams.n_threads_batch;
 }

@@ -113,7 +113,7 @@ static struct ggml_tensor * get_random_tensor_f32(
 }

 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

     if (plan.work_size > 0) {
         buf.resize(plan.work_size);