Merge branch 'ggerganov:master' into snapdragonxwin-fix1
This commit is contained in:
commit
3cc2edb073
18 changed files with 31 additions and 27 deletions
|
@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
|
|||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
|
|||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
|
|||
# Target the CUDA runtime image
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
@ -25,7 +25,7 @@ ENV GGML_CUDA=1
|
|||
|
||||
RUN make -j$(nproc) llama-cli
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
|
@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
|||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
||||
|
||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
|
|||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=jammy
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget libgomp1
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
|
@ -11,7 +11,7 @@ COPY . .
|
|||
|
||||
RUN make -j$(nproc) llama-cli
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libgomp1
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
|
|||
# Target the CUDA runtime image
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
@ -27,7 +27,7 @@ ENV LLAMA_CURL=1
|
|||
|
||||
RUN make -j$(nproc) llama-server
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||
|
||||
ARG GGML_SYCL_F16=OFF
|
||||
RUN apt-get update && \
|
||||
|
@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
|||
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||
cmake --build build --config Release --target llama-server
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev curl
|
||||
|
|
|
@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
|
|||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} AS build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=jammy
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
# Install build tools
|
||||
RUN apt update && apt install -y git build-essential cmake wget
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
FROM ubuntu:$UBUNTU_VERSION AS build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git libcurl4-openssl-dev curl
|
||||
|
@ -13,7 +13,7 @@ ENV LLAMA_CURL=1
|
|||
|
||||
RUN make -j$(nproc) llama-server
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
FROM ubuntu:$UBUNTU_VERSION AS runtime
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
- Test your changes:
|
||||
- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
|
||||
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||
- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
|
||||
- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
|
||||
- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience
|
||||
|
||||
|
|
|
@ -37,11 +37,18 @@ struct llama_ngram {
|
|||
}
|
||||
};
|
||||
|
||||
struct llama_token_hash_function {
|
||||
size_t operator()(const llama_token token) const {
|
||||
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
|
||||
return token * 11400714819323198485llu;
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_ngram_hash_function {
|
||||
size_t operator()(const llama_ngram & ngram) const {
|
||||
size_t hash = 0;
|
||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
|
||||
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
|
||||
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
|
||||
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
|
|
@ -31,7 +31,6 @@ int main(int argc, char ** argv){
|
|||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
|
@ -65,7 +64,7 @@ int main(int argc, char ** argv){
|
|||
}
|
||||
|
||||
const int n_input = inp.size();
|
||||
const int n_ctx = params.n_ctx;
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
int n_drafted = 0;
|
||||
int n_accept = 0;
|
||||
|
|
|
@ -39,7 +39,6 @@ int main(int argc, char ** argv){
|
|||
|
||||
// load the model
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));
|
||||
|
||||
// tokenize the prompt
|
||||
std::vector<llama_token> inp;
|
||||
|
|
|
@ -1182,7 +1182,7 @@ struct server_context {
|
|||
|
||||
bool process_token(completion_token_output & result, server_slot & slot) {
|
||||
// remember which tokens were sampled - used for repetition penalties during sampling
|
||||
const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
|
||||
const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
|
||||
slot.sampled = result.tok;
|
||||
|
||||
// search stop word and delete it
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue