diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
deleted file mode 100644
index 618cdddc4..000000000
--- a/.devops/full.Dockerfile
+++ /dev/null
@@ -1,17 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install torch torchvision torchaudio sentencepiece numpy
-
-WORKDIR /app
-
-COPY . .
-
-RUN make
-
-ENTRYPOINT ["/app/.devops/tools.sh"]
\ No newline at end of file
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
deleted file mode 100644
index cd575efa0..000000000
--- a/.devops/main.Dockerfile
+++ /dev/null
@@ -1,18 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential
-
-WORKDIR /app
-
-COPY . .
-
-RUN make
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
-
-COPY --from=build /app/main /main
-
-ENTRYPOINT [ "/main" ]
\ No newline at end of file
diff --git a/.devops/tools.sh b/.devops/tools.sh
deleted file mode 100755
index 352e04942..000000000
--- a/.devops/tools.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-set -e
-
-# Read the first argument into a variable
-arg1="$1"
-
-# Shift the arguments to remove the first one
-shift
-
-# Join the remaining arguments into a single string
-arg2="$@"
-
-if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
-    python3 ./convert-pth-to-ggml.py $arg2
-elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
-    ./quantize $arg2
-elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
-    ./main $arg2
-elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
-    python3 ./download-pth.py $arg2
-elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
-    echo "Downloading model..."
-    python3 ./download-pth.py "$1" "$2"
-    echo "Converting PTH to GGML..."
-    for i in `ls $1/$2/ggml-model-f16.bin*`; do
-        if [ -f "${i/f16/q4_0}" ]; then
-            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
-        else
-            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" 2
-        fi
-    done
-else
-    echo "Unknown command: $arg1"
-    echo "Available commands: "
-    echo "  --run (-r): Run a model previously converted into ggml"
-    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-    echo "  --convert (-c): Convert a llama model into ggml"
-    echo "      ex: \"/models/7B/\" 1"
-    echo "  --quantize (-q): Optimize with quantization process ggml"
-    echo "      ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
-    echo "      ex: \"/models/\" 7B"
-    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
-    echo "      ex: \"/models/\" 7B"
-fi
diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 38e7266dc..000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,131 +0,0 @@
-cmake_minimum_required(VERSION 3.8)
-project("llama.cpp")
-
-set(CMAKE_CXX_STANDARD 20)
-set(CMAKE_CXX_STANDARD_REQUIRED true)
-set(CMAKE_C_STANDARD 11)
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-find_package(Threads REQUIRED)
-
-if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
-endif()
-
-option(LLAMA_ALL_WARNINGS           "llama: enable all compiler warnings"                   ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
-
-option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer"    OFF)
-option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer"   OFF)
-option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
-
-if (APPLE)
-    option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF)
-    option(LLAMA_NO_AVX        "llama: disable AVX"                  OFF)
-    option(LLAMA_NO_AVX2       "llama: disable AVX2"                 OFF)
-    option(LLAMA_NO_FMA        "llama: disable FMA"                  OFF)
-endif()
-
-if (NOT MSVC)
-    if (LLAMA_SANITIZE_THREAD)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=thread")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
-    endif()
-
-    if (LLAMA_SANITIZE_ADDRESS)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=address -fno-omit-frame-pointer")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
-    endif()
-
-    if (LLAMA_SANITIZE_UNDEFINED)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=undefined")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
-    endif()
-endif()
-
-if (APPLE AND NOT LLAMA_NO_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-
-        set(LLAMA_EXTRA_LIBS  ${LLAMA_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-        set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-endif()
-
-if (LLAMA_ALL_WARNINGS)
-    if (NOT MSVC)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
-            -Wall                           \
-            -Wextra                         \
-            -Wpedantic                      \
-            -Wshadow                        \
-            -Wcast-qual                     \
-            -Wstrict-prototypes             \
-            -Wpointer-arith                 \
-            -Wno-unused-function            \
-        ")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-            -Wall                               \
-            -Wextra                             \
-            -Wpedantic                          \
-            -Wcast-qual                         \
-        ")
-    else()
-        # todo : msvc
-    endif()
-endif()
-
-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
-    message(STATUS "ARM detected")
-else()
-    message(STATUS "x86 detected")
-    if (MSVC)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
-        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
-    else()
-        if(NOT LLAMA_NO_AVX)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-        if(NOT LLAMA_NO_AVX2)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-        if(NOT LLAMA_NO_FMA)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-    endif()
-endif()
-
-# if (LLAMA_PERF)
-#     set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
-# endif()
-
-add_executable(llama
-    main.cpp
-    utils.cpp
-    utils.h)
-
-add_executable(quantize
-    quantize.cpp
-    utils.cpp
-    utils.h)
-
-add_library(ggml
-    ggml.c
-    ggml.h)
-
-target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
-
-target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
-target_include_directories(ggml PUBLIC .)
-target_link_libraries(quantize PRIVATE ggml)
-target_link_libraries(llama PRIVATE ggml)
-target_link_libraries(ggml PRIVATE Threads::Threads)
diff --git a/Makefile b/Makefile
index d64f65a4b..77dec0e0c 100644
--- a/Makefile
+++ b/Makefile
@@ -196,7 +196,7 @@ main: main.cpp ggml.o utils.o
 	./main -h
 
 llamalib: expose.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o -shared -o llamalib.dll $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o -shared -o llamacpp.dll $(LDFLAGS)
 
 quantize: quantize.cpp ggml.o utils.o
 	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)
diff --git a/README.md b/README.md
index 0dbcb4707..e7d2c632c 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,6 @@ If you care, **please contribute to [this discussion](https://github.com/ggergan
 - No external libraries or dependencies. That means no Flask, Pybind and whatever. All You Need Is Python.
 
 ## Usage
-- Windows binaries are provided in the form of **llamalib.dll** but if you feel worried go ahead and rebuild it yourself.
+- Windows binaries are provided in the form of **llamacpp.dll** but if you feel worried go ahead and rebuild it yourself.
 - Weights are not included, you can use the llama.cpp quantize.exe to generate them from your official weight files (or download them from...places).
 - To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite.
diff --git a/expose.cpp b/expose.cpp
index 0ca6b67d8..2df992bac 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -17,6 +17,7 @@ extern "C" {
         const int max_context_length;
         const int batch_size;
         const char * model_filename;
+        const int n_parts_overwrite = -1;
     };
 
     struct generation_inputs {
@@ -48,7 +49,9 @@ extern "C" {
         api_params.n_batch = inputs.batch_size;
         api_params.model = inputs.model_filename;
 
-        if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx)) {
+        int n_parts_overwrite = inputs.n_parts_overwrite;
+
+        if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, n_parts_overwrite)) {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, api_params.model.c_str());
             return false;
         }
@@ -67,10 +70,23 @@ extern "C" {
         api_params.repeat_last_n = inputs.rep_pen_range;
         api_params.repeat_penalty = inputs.rep_pen;
 
+        if(api_params.repeat_last_n<1)
+        {
+            api_params.repeat_last_n = 1;
+        }
+        if(api_params.top_k<1)
+        {
+            api_params.top_k = 300; //to disable top_k we actually need to increase this value to a very high number
+        }
         if (api_params.seed < 0)
         {
             api_params.seed = time(NULL);
         }
+
+        //display usage
+        // std::string tst = " ";
+        // char * tst2 = (char*)tst.c_str();
+        // gpt_print_usage(1,&tst2,api_params);
         api_params.prompt.insert(0, 1, ' ');
 
         // tokenize the prompt
@@ -157,7 +173,7 @@
 
         }
 
-        printf("output: %s",concat_output.c_str());
+        //printf("output: %s",concat_output.c_str());
         output.status = 1;
         _snprintf_s(output.text,sizeof(output.text),_TRUNCATE,"%s",concat_output.c_str());
         return output;
diff --git a/flake.lock b/flake.lock
deleted file mode 100644
index 343996da1..000000000
--- a/flake.lock
+++ /dev/null
@@ -1,43 +0,0 @@
-{
-  "nodes": {
-    "flake-utils": {
-      "locked": {
-        "lastModified": 1676283394,
-        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
-        "type": "github"
-      },
-      "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "type": "github"
-      }
-    },
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1678470307,
-        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "root": {
-      "inputs": {
-        "flake-utils": "flake-utils",
-        "nixpkgs": "nixpkgs"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
-}
diff --git a/flake.nix b/flake.nix
deleted file mode 100644
index dae4ff60f..000000000
--- a/flake.nix
+++ /dev/null
@@ -1,48 +0,0 @@
-{
-  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
-    flake-utils.url = "github:numtide/flake-utils";
-  };
-  outputs = { self, nixpkgs, flake-utils }:
-    flake-utils.lib.eachDefaultSystem (system:
-      let
-        pkgs = import nixpkgs {
-          inherit system;
-        };
-        llama-python = pkgs.python310.withPackages (ps: with ps; [
-          torch
-          numpy
-          sentencepiece
-        ]);
-      in
-      {
-        packages.default = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
-          src = ./.;
-          nativeBuildInputs = with pkgs; [ cmake ];
-          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
-            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-          ];
-          installPhase = ''
-            mkdir -p $out/bin
-            mv llama $out/bin/llama
-            mv quantize $out/bin/quantize
-            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
-            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
-            chmod +x $out/bin/convert-pth-to-ggml
-          '';
-        };
-        devShells.default = pkgs.mkShell {
-          packages = with pkgs; [
-            cmake
-            llama-python
-          ] ++ lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-        };
-      }
-    );
-}
diff --git a/llama_for_kobold.py b/llama_for_kobold.py
index 333b81094..be02d5022 100644
--- a/llama_for_kobold.py
+++ b/llama_for_kobold.py
@@ -10,7 +10,8 @@ class load_model_inputs(ctypes.Structure):
     _fields_ = [("threads", ctypes.c_int),
                 ("max_context_length", ctypes.c_int),
                 ("batch_size", ctypes.c_int),
-                ("model_filename", ctypes.c_char_p)]
+                ("model_filename", ctypes.c_char_p),
+                ("n_parts_overwrite", ctypes.c_int)]
 
 class generation_inputs(ctypes.Structure):
     _fields_ = [("seed", ctypes.c_int),
@@ -27,19 +28,20 @@ class generation_outputs(ctypes.Structure):
                 ("text", ctypes.c_char * 16384)]
 
 dir_path = os.path.dirname(os.path.realpath(__file__))
-handle = ctypes.CDLL(dir_path + "/llamalib.dll")
+handle = ctypes.CDLL(dir_path + "/llamacpp.dll")
 
 handle.load_model.argtypes = [load_model_inputs]
 handle.load_model.restype = ctypes.c_bool
 handle.generate.argtypes = [generation_inputs]
 handle.generate.restype = generation_outputs
 
-def load_model(model_filename,batch_size=8,max_context_length=512,threads=4):
+def load_model(model_filename,batch_size=8,max_context_length=512,threads=4,n_parts_overwrite=-1):
     inputs = load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
     inputs.batch_size = batch_size
    inputs.max_context_length = max_context_length
     inputs.threads = threads
+    inputs.n_parts_overwrite = n_parts_overwrite
     ret = handle.load_model(inputs)
     return ret
 
@@ -233,9 +235,13 @@ if __name__ == '__main__':
         print("Cannot find model file: " + sys.argv[1])
         exit()
 
+    mdl_nparts = 1
+    for n in range(1,9):
+        if os.path.exists(sys.argv[1]+"."+str(n)):
+            mdl_nparts += 1
     modelname = os.path.abspath(sys.argv[1])
     print("Loading model: " + modelname)
-    loadok = load_model(modelname,128,maxctx,4)
+    loadok = load_model(modelname,128,maxctx,4,mdl_nparts)
    print("Load Model OK: " + str(loadok))
 
     if loadok:
diff --git a/llamacpp.dll b/llamacpp.dll
new file mode 100644
index 000000000..baac468e7
Binary files /dev/null and b/llamacpp.dll differ
diff --git a/llamalib.dll b/llamalib.dll
deleted file mode 100644
index d0fc07eb9..000000000
Binary files a/llamalib.dll and /dev/null differ
diff --git a/main.cpp b/main.cpp
index 3bef985ac..4c9b2ff9b 100644
--- a/main.cpp
+++ b/main.cpp
@@ -86,7 +86,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
+bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, int n_parts_overwrite=-1) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
@@ -132,6 +132,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
         n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
+        if(n_parts_overwrite>0)
+        {
+            n_parts = n_parts_overwrite;
+        }
 
         fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -793,6 +797,11 @@ int main(int argc, char ** argv) {
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
 
     if (params.seed < 0) {
         params.seed = time(NULL);
diff --git a/main.exe b/main.exe
index f66748c2c..2c9ab1201 100644
Binary files a/main.exe and b/main.exe differ
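
Appendix (not part of the patch above): a minimal Python sketch of how the new n_parts_overwrite value is intended to be derived and passed through the patched load_model() wrapper in llama_for_kobold.py. The helper name count_model_parts and the usage line are hypothetical; the loop simply mirrors the mdl_nparts logic added in the hunk above, which assumes a split ggml model is stored as model.bin plus numbered companions model.bin.1, model.bin.2, and so on.

import os

def count_model_parts(model_path, max_extra_parts=8):
    # Hypothetical helper: count the numbered companion files of a split
    # ggml model (model.bin.1, model.bin.2, ...), mirroring the mdl_nparts
    # loop added to llama_for_kobold.py in this patch.
    parts = 1
    for n in range(1, max_extra_parts + 1):
        if os.path.exists(model_path + "." + str(n)):
            parts += 1
    return parts

# Hypothetical usage, after llamacpp.dll has been loaded as in the script:
#   loadok = load_model(model_path, 128, maxctx, 4, count_model_parts(model_path))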