Merge branch 'master' into master

commit eef66e1d2e
Georgi Gerganov, 2023-07-21 13:52:25 +03:00 (committed by GitHub)
14 changed files with 617 additions and 523 deletions

.gitignore

@@ -62,6 +62,16 @@ perf-*.txt
 examples/jeopardy/results.txt
 pyproject.toml
 poetry.lock
 poetry.toml
+# Test binaries
+tests/test-double-float
+tests/test-grad0
+tests/test-opt
+tests/test-quantize-fns
+tests/test-quantize-perf
+tests/test-sampling
+tests/test-tokenizer-0

CMakeLists.txt

@@ -186,16 +186,7 @@ if (LLAMA_BLAS)
             pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
         elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
             # all Intel* libraries share the same include path
-            pkg_check_modules(DepBLAS mkl-sdl)
-            if (NOT DepBLAS)
-                if (BUILD_SHARED_LIBS)
-                    set(LINK_METHOD dynamic)
-                else()
-                    set(LINK_METHOD static)
-                endif()
-                string(REGEX REPLACE ".*_" "" DATA_TYPE_MODEL ${LLAMA_BLAS_VENDOR})
-                pkg_check_modules(DepBLAS REQUIRED mkl-${LINK_METHOD}-${DATA_TYPE_MODEL}-iomp)
-            endif()
+            pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
         elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
             # this doesn't provide pkg-config
             # suggest to assign BLAS_INCLUDE_DIRS on your own

Makefile

@@ -1,5 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server libembdinput.so embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+# Binaries only useful for tests
+TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
 default: $(BUILD_TARGETS)
@@ -90,6 +93,28 @@ ifeq ($(UNAME_S),Haiku)
 CXXFLAGS += -pthread
 endif
+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+_WIN32 := 1
+endif
+# library name prefix
+ifneq ($(_WIN32),1)
+LIB_PRE := lib
+endif
+# Dynamic Shared Object extension
+ifneq ($(_WIN32),1)
+DSO_EXT := .so
+else
+DSO_EXT := .dll
+endif
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+LWINSOCK2 := -lws2_32
+endif
 ifdef LLAMA_GPROF
 CFLAGS += -pg
 CXXFLAGS += -pg
@@ -168,8 +193,12 @@ ifdef LLAMA_CUBLAS
 CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 OBJS += ggml-cuda.o
-NVCC = nvcc
 NVCCFLAGS = --forward-unknown-to-host-compiler
+ifdef LLAMA_CUDA_NVCC
+NVCC = $(LLAMA_CUDA_NVCC)
+else
+NVCC = nvcc
+endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
 NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
 else
@@ -198,7 +227,9 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
 else
 NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
 endif
+ifdef LLAMA_CUDA_CCBIN
+NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
+endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
@@ -294,7 +325,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 clean:
-	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
 #
 # Examples
@@ -325,14 +356,14 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
-libembdinput.so: examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
-embd-input-test: libembdinput.so examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $(filter-out %.so,$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
+	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@@ -349,6 +380,8 @@ build-info.h: $(wildcard .git/index) scripts/build-info.sh
 # Tests
 #
+tests: $(TEST_TARGETS)
 benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	./$@
@@ -356,6 +389,23 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-.PHONY: tests clean
-tests:
-	bash ./tests/run-tests.sh
+tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
+tests/test-tokenizer-0: tests/test-tokenizer-0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

README.md

@@ -360,7 +360,7 @@ Building the program with BLAS support may lead to some performance improvements
 ```bash
 mkdir build
 cd build
-cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_lp64 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 cmake --build . --config Release
 ```

ci/run.sh

@@ -243,7 +243,7 @@ function gg_sum_open_llama_3b_v2 {
 if [ -z $GG_BUILD_LOW_PERF ]; then
     rm -rf ${SRC}/models-mnt
-    mnt_models=$(realpath ${MNT}/models)
+    mnt_models=${MNT}/models
     mkdir -p ${mnt_models}
     ln -sfn ${mnt_models} ${SRC}/models-mnt

examples/Miku.sh

@@ -2,21 +2,21 @@
 set -e
 AI_NAME="${AI_NAME:-Miku}"
-MODEL="${MODEL:-./models/gpt4all-7B/gpt4all-lora-unfiltered-quantized.bin}"
+MODEL="${MODEL:-./models/llama-2-7b-chat.ggmlv3.q4_K_M.bin}"
 USER_NAME="${USER_NAME:-Anon}"
 # Uncomment and adjust to the number of CPU cores you want to use.
 #N_THREAD="${N_THREAD:-4}"
+CTX_SIZE="${CTX_SIZE:-4096}"
 N_PREDICTS="${N_PREDICTS:-4096}"
 GEN_OPTIONS=(--batch_size 1024
---ctx_size 2048
+--ctx_size "$CTX_SIZE"
 --keep -1
 --repeat_last_n 256
 --repeat_penalty 1.17647
---temp 0.7
+--temp 0.6
---top_k 40
---top_p 0.5)
+--mirostat 2)
 if [ -n "$N_THREAD" ]; then
 GEN_OPTIONS+=(--threads "$N_THREAD")
@@ -24,16 +24,17 @@ fi
 ./main "${GEN_OPTIONS[@]}" \
 --model "$MODEL" \
+--in-prefix " " \
+--in-suffix "${AI_NAME}:" \
 --n_predict "$N_PREDICTS" \
 --color --interactive \
 --reverse-prompt "${USER_NAME}:" \
---prompt "
-This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
+--prompt "This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the user's computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct, she will ask the user for help.
 ${AI_NAME} is a very helpful AI and will help the user with anything they need. She is also very friendly and will try to make the user feel better if they are sad.
 ${AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life. She will also try to make the user like her.
-The conversation is only between ${USER_NAME} and ${AI_NAME}
+The conversation is only between ${USER_NAME} and ${AI_NAME}.
 The conversation is only through text, so ${AI_NAME} can't see ${USER_NAME}'s face or hear his voice.
 ${AI_NAME} can only communicate through text, so she can't send images or videos.

examples/common.cpp

@@ -586,7 +586,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.n_batch = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
     lparams.main_gpu = params.main_gpu;
-    memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
+    lparams.tensor_split = params.tensor_split;
     lparams.low_vram = params.low_vram;
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
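
Note: the hunk above swaps the old memcpy into a fixed-size member for a plain pointer assignment, matching the llama.h change at the end of this diff. A minimal sketch of how calling code might fill the new field (the helper name is hypothetical; the caller must keep the array alive while the params are in use):

```cpp
// Sketch, not part of the diff: using the pointer-based tensor_split field.
#include "llama.h"

llama_context_params make_params_with_split(const float * split /* LLAMA_MAX_DEVICES entries, or nullptr */) {
    llama_context_params lparams = llama_context_default_params();
    lparams.tensor_split = split;  // store the pointer; no copy into the struct anymore
    return lparams;
}
```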

examples/server/CMakeLists.txt

@@ -7,6 +7,9 @@ target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if(TARGET BUILD_INFO)
     add_dependencies(${TARGET} BUILD_INFO)

flake.nix

@@ -6,7 +6,7 @@
 outputs = { self, nixpkgs, flake-utils }:
 flake-utils.lib.eachDefaultSystem (system:
 let
-inherit (pkgs.stdenv) isAarch32 isAarch64 isx86_32 isx86_64 isDarwin;
+inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin;
 osSpecific = with pkgs; [ openmpi ] ++
 (
 if isAarch64 && isDarwin then
@@ -22,14 +22,13 @@
 CoreGraphics
 CoreVideo
 ]
-else if isx86_32 || isx86_64 then
-with pkgs; [ mkl ]
 else
 with pkgs; [ openblas ]
 );
 pkgs = import nixpkgs { inherit system; };
+nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
 llama-python =
-pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]);
+pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
 in {
 packages.default = pkgs.stdenv.mkDerivation {
 name = "llama.cpp";
@@ -37,33 +36,21 @@
 postPatch = ''
 substituteInPlace ./ggml-metal.m \
 --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python'
 '';
-nativeBuildInputs = with pkgs; [ cmake pkgconfig ];
+nativeBuildInputs = nativeBuildInputs;
 buildInputs = osSpecific;
 cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]
 ++ (if isAarch64 && isDarwin then [
 "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
 "-DLLAMA_METAL=ON"
-] else if isx86_32 || isx86_64 then [
-"-DLLAMA_BLAS=ON"
-"-DLLAMA_BLAS_VENDOR=Intel10_lp64"
 ] else [
 "-DLLAMA_BLAS=ON"
 "-DLLAMA_BLAS_VENDOR=OpenBLAS"
 ]);
-installPhase = ''
-runHook preInstall
-install -D bin/* -t $out/bin
-install -Dm644 lib*.so -t $out/lib
+postInstall = ''
 mv $out/bin/main $out/bin/llama
 mv $out/bin/server $out/bin/llama-server
-echo "#!${llama-python}/bin/python" > $out/bin/convert.py
-cat ${./convert.py} >> $out/bin/convert.py
-chmod +x $out/bin/convert.py
-runHook postInstall
 '';
 meta.mainProgram = "llama";
 };
@@ -81,7 +68,7 @@
 };
 apps.default = self.apps.${system}.llama;
 devShells.default = pkgs.mkShell {
-packages = with pkgs; [ cmake llama-python ] ++ osSpecific;
+packages = nativeBuildInputs ++ osSpecific;
 };
 });
 }

ggml-cuda.cu

@@ -2512,6 +2512,9 @@ void ggml_init_cublas() {
 }
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
     bool all_zero = true;
     for (int i = 0; i < g_device_count; ++i) {
         if (tensor_split[i] != 0.0f) {

ggml-metal.m

@@ -676,8 +676,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
 } break;
 case GGML_TYPE_Q3_K:
@@ -694,8 +694,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
 } break;
 case GGML_TYPE_Q5_K:
@@ -703,8 +703,8 @@ void ggml_metal_graph_compute(
 GGML_ASSERT(ne02 == 1);
 GGML_ASSERT(ne12 == 1);
-nth0 = 4;
-nth1 = 16;
+nth0 = 2;
+nth1 = 32;
 [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
 } break;
 case GGML_TYPE_Q6_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
@ -712,8 +712,8 @@ void ggml_metal_graph_compute(
GGML_ASSERT(ne02 == 1); GGML_ASSERT(ne02 == 1);
GGML_ASSERT(ne12 == 1); GGML_ASSERT(ne12 == 1);
nth0 = 4; nth0 = 2;
nth1 = 16; nth1 = 32;
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32]; [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
} break; } break;
default: default:
@@ -739,14 +739,17 @@ void ggml_metal_graph_compute(
 [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
 [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
-if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1) {
+if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+    src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
     [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7) / 8, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 }
-else if (src0t == GGML_TYPE_Q2_K ||
-         src0t == GGML_TYPE_Q3_K ||
-         src0t == GGML_TYPE_Q4_K ||
-         src0t == GGML_TYPE_Q5_K ||
-         src0t == GGML_TYPE_Q6_K) {
+else if (src0t == GGML_TYPE_Q5_K) {
+    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3) / 4, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+}
+else if (src0t == GGML_TYPE_Q6_K) {
+    [encoder dispatchThreadgroups:MTLSizeMake((ne01+1)/2, ne11, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+}
+else if (src0t == GGML_TYPE_Q3_K) {
     [encoder setThreadgroupMemoryLength:nth0*nth1*sizeof(float) atIndex:0];
     [encoder dispatchThreadgroups:MTLSizeMake(ne01, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
 } else {
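
Note: the dispatch sizes above are integer ceiling divisions, so with the new nth0 = 2, nth1 = 32 threadgroup shape each threadgroup covers a fixed number of rows (8 for Q4_0/Q4_1/Q2_K/Q4_K, 4 for Q5_K, 2 for Q6_K). A small standalone sketch of the rounding-up idiom (not from the diff):

```cpp
// Sketch: (n + d - 1) / d rounds up, so every row is covered even when
// ne01 is not an exact multiple of the rows-per-threadgroup count.
#include <cstdint>

constexpr int64_t ceil_div(int64_t n, int64_t d) { return (n + d - 1) / d; }

static_assert(ceil_div(4096, 8) == 512, "exact multiple");
static_assert(ceil_div(4097, 8) == 513, "one extra threadgroup for the remainder");
```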
@@ -792,7 +795,7 @@ void ggml_metal_graph_compute(
 const float eps = 1e-6f;
-const int nth = 256;
+const int nth = 512;
 [encoder setComputePipelineState:ctx->pipeline_rms_norm];
 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
@@ -800,7 +803,7 @@ void ggml_metal_graph_compute(
 [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
 [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
 [encoder setBytes:&eps length:sizeof( float) atIndex:4];
-[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
+[encoder setThreadgroupMemoryLength:nth/32*sizeof(float) atIndex:0];
 const int64_t nrows = ggml_nrows(src0);
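
Note: the rms_norm threadgroup memory shrinks from one float per thread to one float per 32 threads; presumably the rewritten kernel (part of the suppressed Metal shader diff below) reduces within 32-lane SIMD groups first, so only one partial sum per SIMD group needs to be staged. A tiny sanity check of that sizing (an assumption, not from the diff):

```cpp
// Assumption: 512 threads per threadgroup, 32-lane SIMD groups.
constexpr int nth        = 512;
constexpr int simd_width = 32;
static_assert(nth / simd_width == 16, "only 16 floats of threadgroup memory needed");
```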

(One file's diff is suppressed because it is too large.)

llama.cpp

@@ -555,7 +555,9 @@ struct llama_file_loader {
 }
 // skip to the next multiple of 32 bytes
+if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
     file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
+}
 tensor.file_off = file.tell();
 tensor.name = name;
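
Note: the newly guarded seek rounds the read offset up to the next 32-byte boundary; `-offset & 31` is the number of padding bytes needed to reach it. A standalone sketch of the arithmetic (not from the diff):

```cpp
// Sketch of the alignment arithmetic used by the guarded seek above.
#include <cassert>
#include <cstddef>

std::ptrdiff_t pad_to_32(std::ptrdiff_t offset) {
    // For offset >= 0: -offset & 31 == (32 - offset % 32) % 32
    return -offset & 31;
}

int main() {
    assert(pad_to_32(0)  == 0);   // already aligned
    assert(pad_to_32(1)  == 31);  // 1 + 31 == 32
    assert(pad_to_32(60) == 4);   // 60 + 4 == 64
    return 0;
}
```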
@@ -847,7 +849,7 @@ struct llama_context_params llama_context_default_params() {
 /*.n_batch =*/ 512,
 /*.gpu_layers =*/ 0,
 /*.main_gpu =*/ 0,
-/*.tensor_split =*/ {0},
+/*.tensor_split =*/ nullptr,
 /*.rope_freq_base =*/ 10000.0f,
 /*.rope_freq_scale =*/ 1.0f,
 /*.progress_callback =*/ nullptr,
@@ -1287,7 +1289,7 @@ static bool llama_model_load(
 int n_batch,
 int n_gpu_layers,
 int main_gpu,
-float * tensor_split,
+const float * tensor_split,
 float rope_freq_base,
 float rope_freq_scale,
 bool low_vram,

llama.h

@@ -88,7 +88,8 @@ extern "C" {
 int32_t n_batch; // prompt processing batch size
 int32_t n_gpu_layers; // number of layers to store in VRAM
 int32_t main_gpu; // the GPU that is used for scratch and small tensors
-float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
 float rope_freq_base; // RoPE base frequency