Merge branch 'ggerganov:master' into snapdragonxwin-fix1
This commit is contained in:
commit
3cc2edb073
18 changed files with 31 additions and 27 deletions
|
@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
|
||||||
# Target the CUDA build image
|
# Target the CUDA build image
|
||||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||||
|
|
||||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
# Unless otherwise specified, we make a fat build.
|
||||||
ARG CUDA_DOCKER_ARCH=all
|
ARG CUDA_DOCKER_ARCH=all
|
||||||
|
|
|
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
|
||||||
# Target the ROCm build image
|
# Target the ROCm build image
|
||||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||||
|
|
||||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
# Unless otherwise specified, we make a fat build.
|
||||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
ARG UBUNTU_VERSION=22.04
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as build
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||||
|
|
|
@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
|
||||||
# Target the CUDA runtime image
|
# Target the CUDA runtime image
|
||||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||||
|
|
||||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
# Unless otherwise specified, we make a fat build.
|
||||||
ARG CUDA_DOCKER_ARCH=all
|
ARG CUDA_DOCKER_ARCH=all
|
||||||
|
@ -25,7 +25,7 @@ ENV GGML_CUDA=1
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-cli
|
RUN make -j$(nproc) llama-cli
|
||||||
|
|
||||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libgomp1
|
apt-get install -y libgomp1
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||||
|
|
||||||
ARG GGML_SYCL_F16=OFF
|
ARG GGML_SYCL_F16=OFF
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
|
@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||||
cmake --build build --config Release --target llama-cli
|
cmake --build build --config Release --target llama-cli
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
|
||||||
# Target the ROCm build image
|
# Target the ROCm build image
|
||||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||||
|
|
||||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
# Unless otherwise specified, we make a fat build.
|
||||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
ARG UBUNTU_VERSION=jammy
|
ARG UBUNTU_VERSION=jammy
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as build
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
# Install build tools
|
# Install build tools
|
||||||
RUN apt update && apt install -y git build-essential cmake wget libgomp1
|
RUN apt update && apt install -y git build-essential cmake wget libgomp1
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
ARG UBUNTU_VERSION=22.04
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as build
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential git
|
apt-get install -y build-essential git
|
||||||
|
@ -11,7 +11,7 @@ COPY . .
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-cli
|
RUN make -j$(nproc) llama-cli
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libgomp1
|
apt-get install -y libgomp1
|
||||||
|
|
|
@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
|
||||||
# Target the CUDA runtime image
|
# Target the CUDA runtime image
|
||||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||||
|
|
||||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
# Unless otherwise specified, we make a fat build.
|
||||||
ARG CUDA_DOCKER_ARCH=all
|
ARG CUDA_DOCKER_ARCH=all
|
||||||
|
@ -27,7 +27,7 @@ ENV LLAMA_CURL=1
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-server
|
RUN make -j$(nproc) llama-server
|
||||||
|
|
||||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||||
|
|
||||||
ARG GGML_SYCL_F16=OFF
|
ARG GGML_SYCL_F16=OFF
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
|
@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||||
cmake --build build --config Release --target llama-server
|
cmake --build build --config Release --target llama-server
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev curl
|
apt-get install -y libcurl4-openssl-dev curl
|
||||||
|
|
|
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
|
||||||
# Target the ROCm build image
|
# Target the ROCm build image
|
||||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||||
|
|
||||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||||
|
|
||||||
# Unless otherwise specified, we make a fat build.
|
# Unless otherwise specified, we make a fat build.
|
||||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
ARG UBUNTU_VERSION=jammy
|
ARG UBUNTU_VERSION=jammy
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as build
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
# Install build tools
|
# Install build tools
|
||||||
RUN apt update && apt install -y git build-essential cmake wget
|
RUN apt update && apt install -y git build-essential cmake wget
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
ARG UBUNTU_VERSION=22.04
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as build
|
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential git libcurl4-openssl-dev curl
|
apt-get install -y build-essential git libcurl4-openssl-dev curl
|
||||||
|
@ -13,7 +13,7 @@ ENV LLAMA_CURL=1
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-server
|
RUN make -j$(nproc) llama-server
|
||||||
|
|
||||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
apt-get install -y libcurl4-openssl-dev libgomp1
|
||||||
|
|
|
@ -5,7 +5,6 @@
|
||||||
- Test your changes:
|
- Test your changes:
|
||||||
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
|
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
|
||||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||||
- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
|
|
||||||
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
|
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
|
||||||
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
|
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
|
||||||
|
|
||||||
|
|
|
@ -37,11 +37,18 @@ struct llama_ngram {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct llama_token_hash_function {
|
||||||
|
size_t operator()(const llama_token token) const {
|
||||||
|
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
|
||||||
|
return token * 11400714819323198485llu;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
struct llama_ngram_hash_function {
|
struct llama_ngram_hash_function {
|
||||||
size_t operator()(const llama_ngram & ngram) const {
|
size_t operator()(const llama_ngram & ngram) const {
|
||||||
size_t hash = 0;
|
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
|
||||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
|
||||||
hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
|
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
|
||||||
}
|
}
|
||||||
return hash;
|
return hash;
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,7 +31,6 @@ int main(int argc, char ** argv){
|
||||||
|
|
||||||
// load the model
|
// load the model
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
|
@ -65,7 +64,7 @@ int main(int argc, char ** argv){
|
||||||
}
|
}
|
||||||
|
|
||||||
const int n_input = inp.size();
|
const int n_input = inp.size();
|
||||||
const int n_ctx = params.n_ctx;
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
int n_drafted = 0;
|
int n_drafted = 0;
|
||||||
int n_accept = 0;
|
int n_accept = 0;
|
||||||
|
|
|
@ -39,7 +39,6 @@ int main(int argc, char ** argv){
|
||||||
|
|
||||||
// load the model
|
// load the model
|
||||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
std::vector<llama_token> inp;
|
std::vector<llama_token> inp;
|
||||||
|
|
|
@ -1182,7 +1182,7 @@ struct server_context {
|
||||||
|
|
||||||
bool process_token(completion_token_output & result, server_slot & slot) {
|
bool process_token(completion_token_output & result, server_slot & slot) {
|
||||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||||
const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
|
const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
|
||||||
slot.sampled = result.tok;
|
slot.sampled = result.tok;
|
||||||
|
|
||||||
// search stop word and delete it
|
// search stop word and delete it
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue