Merge branch 'ggerganov:master' into snapdragonxwin-fix1

2024-07-18 12:36:21 +02:00 · 2024-07-18 12:36:21 +02:00 · 3cc2edb073
commit 3cc2edb073
parent fb1746f381 3807c3de04
18 changed files with 31 additions and 27 deletions
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -6,7 +6,7 @@ ARG CUDA_VERSION=11.7.1
 # Target the CUDA build image
 ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
--- a/.devops/llama-cli-cuda.Dockerfile
+++ b/.devops/llama-cli-cuda.Dockerfile
@@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
@@ -25,7 +25,7 @@ ENV GGML_CUDA=1

 RUN make -j$(nproc) llama-cli

-FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
    apt-get install -y libgomp1
--- a/.devops/llama-cli-intel.Dockerfile
+++ b/.devops/llama-cli-intel.Dockerfile
@@ -1,6 +1,6 @@
 ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-cli

-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

 COPY --from=build /app/build/bin/llama-cli /llama-cli

--- a/.devops/llama-cli-rocm.Dockerfile
+++ b/.devops/llama-cli-rocm.Dockerfile
@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
--- a/.devops/llama-cli-vulkan.Dockerfile
+++ b/.devops/llama-cli-vulkan.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=jammy

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget libgomp1
--- a/.devops/llama-cli.Dockerfile
+++ b/.devops/llama-cli.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y build-essential git
@@ -11,7 +11,7 @@ COPY . .

 RUN make -j$(nproc) llama-cli

-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
    apt-get install -y libgomp1
--- a/.devops/llama-server-cuda.Dockerfile
+++ b/.devops/llama-server-cuda.Dockerfile
@@ -6,7 +6,7 @@ ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VER
 # Target the CUDA runtime image
 ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

-FROM ${BASE_CUDA_DEV_CONTAINER} as build
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 ARG CUDA_DOCKER_ARCH=all
@@ -27,7 +27,7 @@ ENV LLAMA_CURL=1

 RUN make -j$(nproc) llama-server

-FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime

 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1 curl
--- a/.devops/llama-server-intel.Dockerfile
+++ b/.devops/llama-server-intel.Dockerfile
@@ -1,6 +1,6 @@
 ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04

-FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

 ARG GGML_SYCL_F16=OFF
 RUN apt-get update && \
@@ -17,7 +17,7 @@ RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
    cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release --target llama-server

-FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime

 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev curl
--- a/.devops/llama-server-rocm.Dockerfile
+++ b/.devops/llama-server-rocm.Dockerfile
@@ -6,7 +6,7 @@ ARG ROCM_VERSION=5.6
 # Target the CUDA build image
 ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

-FROM ${BASE_ROCM_DEV_CONTAINER} as build
+FROM ${BASE_ROCM_DEV_CONTAINER} AS build

 # Unless otherwise specified, we make a fat build.
 # List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
--- a/.devops/llama-server-vulkan.Dockerfile
+++ b/.devops/llama-server-vulkan.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=jammy

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 # Install build tools
 RUN apt update && apt install -y git build-essential cmake wget
--- a/.devops/llama-server.Dockerfile
+++ b/.devops/llama-server.Dockerfile
@@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION=22.04

-FROM ubuntu:$UBUNTU_VERSION as build
+FROM ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
    apt-get install -y build-essential git libcurl4-openssl-dev curl
@@ -13,7 +13,7 @@ ENV LLAMA_CURL=1

 RUN make -j$(nproc) llama-server

-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
    apt-get install -y libcurl4-openssl-dev libgomp1
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -5,7 +5,6 @@
 - Test your changes:
  - Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
  - Execute [the full CI locally on your machine](ci/README.md) before publishing
- If the pull request contains only documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times
 - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
  - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your conveience

--- a/common/ngram-cache.h
+++ b/common/ngram-cache.h
@@ -37,11 +37,18 @@ struct llama_ngram {
    }
 };

+struct llama_token_hash_function {
+    size_t operator()(const llama_token token) const {
+        // see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
+        return token * 11400714819323198485llu;
+    }
+};
+
 struct llama_ngram_hash_function {
    size_t operator()(const llama_ngram & ngram) const {
-        size_t hash = 0;
-        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
-            hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
+        size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
+        for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
+            hash ^= llama_token_hash_function{}(ngram.tokens[i]);
        }
        return hash;
    }
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -31,7 +31,6 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
    std::vector<llama_token> inp;
@@ -65,7 +64,7 @@ int main(int argc, char ** argv){
    }

    const int n_input = inp.size();
-    const int n_ctx = params.n_ctx;
+    const int n_ctx = llama_n_ctx(ctx);

    int n_drafted = 0;
    int n_accept  = 0;
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -39,7 +39,6 @@ int main(int argc, char ** argv){

    // load the model
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
-    GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

    // tokenize the prompt
    std::vector<llama_token> inp;
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -1182,7 +1182,7 @@ struct server_context {

    bool process_token(completion_token_output & result, server_slot & slot) {
        // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
+        const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
        slot.sampled = result.tok;

        // search stop word and delete it