diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
deleted file mode 100644
index 618cdddc4..000000000
--- a/.devops/full.Dockerfile
+++ /dev/null
@@ -1,17 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install torch torchvision torchaudio sentencepiece numpy
-
-WORKDIR /app
-
-COPY . .
-
-RUN make
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
\ No newline at end of file
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
deleted file mode 100644
index cd575efa0..000000000
--- a/.devops/main.Dockerfile
+++ /dev/null
@@ -1,18 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential
-
-WORKDIR /app
-
-COPY . .
-
-RUN make
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
-
-COPY --from=build /app/main /main
-
-ENTRYPOINT [ "/main" ]
\ No newline at end of file
diff --git a/.devops/tools.sh b/.devops/tools.sh
deleted file mode 100755
index 352e04942..000000000
--- a/.devops/tools.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-set -e
-
-# Read the first argument into a variable
-arg1="$1"
-
-# Shift the arguments to remove the first one
-shift
-
-# Join the remaining arguments into a single string
-arg2="$@"
-
-if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
-    python3 ./convert-pth-to-ggml.py $arg2
-elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
-    ./quantize $arg2
-elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
-    ./main $arg2
-elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
-    python3 ./download-pth.py $arg2
-elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
-    echo "Downloading model..."
-    python3 ./download-pth.py "$1" "$2"
-    echo "Converting PTH to GGML..."
-    for i in `ls $1/$2/ggml-model-f16.bin*`; do
-        if [ -f "${i/f16/q4_0}" ]; then
-            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
-        else
-            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" 2
-        fi
-    done
-else
-    echo "Unknown command: $arg1"
-    echo "Available commands: "
-    echo "  --run (-r): Run a model previously converted into ggml"
-    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-    echo "  --convert (-c): Convert a llama model into ggml"
-    echo "      ex: \"/models/7B/\" 1"
-    echo "  --quantize (-q): Optimize with quantization process ggml"
-    echo "      ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
-    echo "      ex: \"/models/\" 7B"
-    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
-    echo "      ex: \"/models/\" 7B"
-fi
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 38e7266dc..000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,131 +0,0 @@
-cmake_minimum_required(VERSION 3.8)
-project("llama.cpp")
-
-set(CMAKE_CXX_STANDARD 20)
-set(CMAKE_CXX_STANDARD_REQUIRED true)
-set(CMAKE_C_STANDARD 11)
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-find_package(Threads REQUIRED)
-
-if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
-endif()
-
-option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
-
-option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer"    OFF)
-option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
-option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
-
-if (APPLE)
-    option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF)
-    option(LLAMA_NO_AVX        "llama: disable AVX"                  OFF)
-    option(LLAMA_NO_AVX2       "llama: disable AVX2"                 OFF)
-    option(LLAMA_NO_FMA        "llama: disable FMA"                  OFF)
-endif()
-
-if (NOT MSVC)
-    if (LLAMA_SANITIZE_THREAD)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=thread")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
-    endif()
-
-    if (LLAMA_SANITIZE_ADDRESS)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=address -fno-omit-frame-pointer")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
-    endif()
-
-    if (LLAMA_SANITIZE_UNDEFINED)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=undefined")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
-    endif()
-endif()
-
-if (APPLE AND NOT LLAMA_NO_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-
-        set(LLAMA_EXTRA_LIBS  ${LLAMA_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-        set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-endif()
-
-if (LLAMA_ALL_WARNINGS)
-    if (NOT MSVC)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
-            -Wall                           \
-            -Wextra                         \
-            -Wpedantic                      \
-            -Wshadow                        \
-            -Wcast-qual                     \
-            -Wstrict-prototypes             \
-            -Wpointer-arith                 \
-            -Wno-unused-function            \
-        ")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-            -Wall                               \
-            -Wextra                             \
-            -Wpedantic                          \
-            -Wcast-qual                         \
-        ")
-    else()
-        # todo : msvc
-    endif()
-endif()
-
-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
-    message(STATUS "ARM detected")
-else()
-    message(STATUS "x86 detected")
-    if (MSVC)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
-        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
-    else()
-        if(NOT LLAMA_NO_AVX)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-        if(NOT LLAMA_NO_AVX2)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-        if(NOT LLAMA_NO_FMA)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-    endif()
-endif()
-
-# if (LLAMA_PERF)
-#     set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
-# endif()
-
-add_executable(llama
-    main.cpp
-    utils.cpp
-    utils.h)
-
-add_executable(quantize
-    quantize.cpp
-    utils.cpp
-    utils.h)
-
-add_library(ggml
-    ggml.c
-    ggml.h)
-
-target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
-
-target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
-target_include_directories(ggml PUBLIC .)
-target_link_libraries(quantize PRIVATE ggml)
-target_link_libraries(llama PRIVATE ggml)
-target_link_libraries(ggml PRIVATE Threads::Threads)
diff --git a/Makefile b/Makefile
index d64f65a4b..77dec0e0c 100644
--- a/Makefile
+++ b/Makefile
@@ -196,7 +196,7 @@ main: main.cpp ggml.o utils.o
 	./main -h
 
 llamalib: expose.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o -shared -o llamalib.dll $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o -shared -o llamacpp.dll $(LDFLAGS)
 
 quantize: quantize.cpp ggml.o utils.o
 	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)
diff --git a/README.md b/README.md
index 0dbcb4707..e7d2c632c 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,6 @@ If you care, **please contribute to [this discussion](https://github.com/ggergan
 - No external libraries or dependencies. That means no Flask, Pybind and whatever. All You Need Is Python.
 
 ## Usage
-- Windows binaries are provided in the form of **llamalib.dll** but if you feel worried go ahead and rebuild it yourself.
+- Windows binaries are provided in the form of **llamacpp.dll** but if you feel worried go ahead and rebuild it yourself.
 - Weights are not included, you can use the llama.cpp quantize.exe to generate them from your official weight files (or download them from...places).
 - To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite.
diff --git a/expose.cpp b/expose.cpp
index 0ca6b67d8..2df992bac 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -17,6 +17,7 @@ extern "C" {
         const int max_context_length;
         const int batch_size;
         const char * model_filename;
+        const int n_parts_overwrite = -1;
     };
 
     struct generation_inputs {
@@ -48,7 +49,9 @@ extern "C" {
         api_params.n_batch = inputs.batch_size;
         api_params.model = inputs.model_filename;
 
-        if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx)) {
+        int n_parts_overwrite = inputs.n_parts_overwrite;
+
+        if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, n_parts_overwrite)) {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, api_params.model.c_str());
             return false;
         }
@@ -67,10 +70,23 @@ extern "C" {
         api_params.repeat_last_n = inputs.rep_pen_range;
         api_params.repeat_penalty = inputs.rep_pen;
 
+        if(api_params.repeat_last_n<1)
+        {
+            api_params.repeat_last_n = 1;
+        }
+        if(api_params.top_k<1)
+        {
+            api_params.top_k = 300; //to disable top_k we actually need to increase this value to a very high number
+        }
         if (api_params.seed < 0)
         {
             api_params.seed = time(NULL);
         }
+
+        //display usage
+        // std::string tst = " ";
+        // char * tst2 = (char*)tst.c_str();
+        // gpt_print_usage(1,&tst2,api_params);
         api_params.prompt.insert(0, 1, ' ');
 
         // tokenize the prompt
@@ -157,7 +173,7 @@
 
         }
 
-        printf("output: %s",concat_output.c_str());
+        //printf("output: %s",concat_output.c_str());
         output.status = 1;
         _snprintf_s(output.text,sizeof(output.text),_TRUNCATE,"%s",concat_output.c_str());
         return output;
diff --git a/flake.lock b/flake.lock
deleted file mode 100644
index 343996da1..000000000
--- a/flake.lock
+++ /dev/null
@@ -1,43 +0,0 @@
-{
-  "nodes": {
-    "flake-utils": {
-      "locked": {
-        "lastModified": 1676283394,
-        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
-        "type": "github"
-      },
-      "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "type": "github"
-      }
-    },
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1678470307,
-        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "root": {
-      "inputs": {
-        "flake-utils": "flake-utils",
-        "nixpkgs": "nixpkgs"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
-}
diff --git a/flake.nix b/flake.nix
deleted file mode 100644
index dae4ff60f..000000000
--- a/flake.nix
+++ /dev/null
@@ -1,48 +0,0 @@
-{
-  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
-    flake-utils.url = "github:numtide/flake-utils";
-  };
-  outputs = { self, nixpkgs, flake-utils }:
-    flake-utils.lib.eachDefaultSystem (system:
-      let
-        pkgs = import nixpkgs {
-          inherit system;
-        };
-        llama-python = pkgs.python310.withPackages (ps: with ps; [
-          torch
-          numpy
-          sentencepiece
-        ]);
-      in
-      {
-        packages.default = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
-          src = ./.;
-          nativeBuildInputs = with pkgs; [ cmake ];
-          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
-            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-          ];
-          installPhase = ''
-            mkdir -p $out/bin
-            mv llama $out/bin/llama
-            mv quantize $out/bin/quantize
-            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
-            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
-            chmod +x $out/bin/convert-pth-to-ggml
-          '';
-        };
-        devShells.default = pkgs.mkShell {
-          packages = with pkgs; [
-            cmake
-            llama-python
-          ] ++ lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-        };
-      }
-    );
-}
diff --git a/llama_for_kobold.py b/llama_for_kobold.py
index 333b81094..be02d5022 100644
--- a/llama_for_kobold.py
+++ b/llama_for_kobold.py
@@ -10,7 +10,8 @@ class load_model_inputs(ctypes.Structure):
     _fields_ = [("threads", ctypes.c_int),
                 ("max_context_length", ctypes.c_int),
                 ("batch_size", ctypes.c_int),
-                ("model_filename", ctypes.c_char_p)]
+                ("model_filename", ctypes.c_char_p),
+                ("n_parts_overwrite", ctypes.c_int)]
 
 class generation_inputs(ctypes.Structure):
     _fields_ = [("seed", ctypes.c_int),
@@ -27,19 +28,20 @@ class generation_outputs(ctypes.Structure):
                 ("text", ctypes.c_char * 16384)]
 
 dir_path = os.path.dirname(os.path.realpath(__file__))
-handle = ctypes.CDLL(dir_path + "/llamalib.dll")
+handle = ctypes.CDLL(dir_path + "/llamacpp.dll")
 
 handle.load_model.argtypes = [load_model_inputs]
 handle.load_model.restype = ctypes.c_bool
 handle.generate.argtypes = [generation_inputs]
 handle.generate.restype = generation_outputs
 
-def load_model(model_filename,batch_size=8,max_context_length=512,threads=4):
+def load_model(model_filename,batch_size=8,max_context_length=512,threads=4,n_parts_overwrite=-1):
     inputs = load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
     inputs.batch_size = batch_size
    inputs.max_context_length = max_context_length
     inputs.threads = threads
+    inputs.n_parts_overwrite = n_parts_overwrite
     ret = handle.load_model(inputs)
     return ret
 
@@ -233,9 +235,13 @@ if __name__ == '__main__':
         print("Cannot find model file: " + sys.argv[1])
         exit()
 
+    mdl_nparts = 1
+    for n in range(1,9):
+        if os.path.exists(sys.argv[1]+"."+str(n)):
+            mdl_nparts += 1
     modelname = os.path.abspath(sys.argv[1])
     print("Loading model: " + modelname)
-    loadok = load_model(modelname,128,maxctx,4)
+    loadok = load_model(modelname,128,maxctx,4,mdl_nparts)
    print("Load Model OK: " + str(loadok))
 
     if loadok:
diff --git a/llamacpp.dll b/llamacpp.dll
new file mode 100644
index 000000000..baac468e7
Binary files /dev/null and b/llamacpp.dll differ
diff --git a/llamalib.dll b/llamalib.dll
deleted file mode 100644
index d0fc07eb9..000000000
Binary files a/llamalib.dll and /dev/null differ
diff --git a/main.cpp b/main.cpp
index 3bef985ac..4c9b2ff9b 100644
--- a/main.cpp
+++ b/main.cpp
@@ -86,7 +86,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
+bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, int n_parts_overwrite=-1) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
@@ -132,6 +132,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
         n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
+        if(n_parts_overwrite>0)
+        {
+            n_parts = n_parts_overwrite;
+        }
 
         fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -793,6 +797,11 @@ int main(int argc, char ** argv) {
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
 
     if (params.seed < 0) {
         params.seed = time(NULL);
diff --git a/main.exe b/main.exe
index f66748c2c..2c9ab1201 100644
Binary files a/main.exe and b/main.exe differ
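
Appendix (not part of the patch above): a minimal Python sketch of how the new n_parts_overwrite value is intended to be derived and passed through the patched load_model() wrapper in llama_for_kobold.py. The helper name count_model_parts and the usage line are hypothetical; the loop simply mirrors the mdl_nparts logic added in the hunk above, which assumes a split ggml model is stored as model.bin plus numbered companions model.bin.1, model.bin.2, and so on.

import os

def count_model_parts(model_path, max_extra_parts=8):
    # Hypothetical helper: count the numbered companion files of a split
    # ggml model (model.bin.1, model.bin.2, ...), mirroring the mdl_nparts
    # loop added to llama_for_kobold.py in this patch.
    parts = 1
    for n in range(1, max_extra_parts + 1):
        if os.path.exists(model_path + "." + str(n)):
            parts += 1
    return parts

# Hypothetical usage, after llamacpp.dll has been loaded as in the script:
#   loadok = load_model(model_path, 128, maxctx, 4, count_model_parts(model_path))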