Removed junk, fixed some bugs, and added support for a dynamic number of sharded files
Merge remote-tracking branch 'origin/master' into concedo

# Conflicts:
#	README.md
commit f952b7c613
14 changed files with 40 additions and 312 deletions
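The headline change is shard-aware model loading: instead of assuming a fixed number of model parts, the Python launcher now counts how many ggml shard files (model.bin.1, model.bin.2, ...) sit next to the base file and passes that count through to the C++ loader as n_parts_overwrite. A minimal sketch of the detection idea, using an illustrative count_shards helper name (the real logic lives inline in the llama_for_kobold.py hunk further down):

import os

def count_shards(base_path, max_extra_parts=8):
    # The base file counts as one part; extra shards are named
    # base_path.1, base_path.2, ... just like the loop added in this commit.
    parts = 1
    for n in range(1, max_extra_parts + 1):
        if os.path.exists(base_path + "." + str(n)):
            parts += 1
    return parts

# e.g. ggml-model-q4_0.bin plus ggml-model-q4_0.bin.1 on disk gives parts == 2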
@@ -1,17 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip
-
-RUN pip install --upgrade pip setuptools wheel \
-    && pip install torch torchvision torchaudio sentencepiece numpy
-
-WORKDIR /app
-
-COPY . .
-
-RUN make
-
-ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,18 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-
-FROM ubuntu:$UBUNTU_VERSION as build
-
-RUN apt-get update && \
-    apt-get install -y build-essential
-
-WORKDIR /app
-
-COPY . .
-
-RUN make
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
-
-COPY --from=build /app/main /main
-
-ENTRYPOINT [ "/main" ]

@@ -1,46 +0,0 @@
-#!/bin/bash
-set -e
-
-# Read the first argument into a variable
-arg1="$1"
-
-# Shift the arguments to remove the first one
-shift
-
-# Join the remaining arguments into a single string
-arg2="$@"
-
-if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
-    python3 ./convert-pth-to-ggml.py $arg2
-elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
-    ./quantize $arg2
-elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
-    ./main $arg2
-elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
-    python3 ./download-pth.py $arg2
-elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
-    echo "Downloading model..."
-    python3 ./download-pth.py "$1" "$2"
-    echo "Converting PTH to GGML..."
-    for i in `ls $1/$2/ggml-model-f16.bin*`; do
-        if [ -f "${i/f16/q4_0}" ]; then
-            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
-        else
-            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" 2
-        fi
-    done
-else
-    echo "Unknown command: $arg1"
-    echo "Available commands: "
-    echo "  --run (-r): Run a model previously converted into ggml"
-    echo "      ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
-    echo "  --convert (-c): Convert a llama model into ggml"
-    echo "      ex: \"/models/7B/\" 1"
-    echo "  --quantize (-q): Optimize with quantization process ggml"
-    echo "      ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
-    echo "      ex: \"/models/\" 7B"
-    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
-    echo "      ex: \"/models/\" 7B"
-fi

131  CMakeLists.txt

@@ -1,131 +0,0 @@
-cmake_minimum_required(VERSION 3.8)
-project("llama.cpp")
-
-set(CMAKE_CXX_STANDARD 20)
-set(CMAKE_CXX_STANDARD_REQUIRED true)
-set(CMAKE_C_STANDARD 11)
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-find_package(Threads REQUIRED)
-
-if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
-    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
-endif()
-
-option(LLAMA_ALL_WARNINGS            "llama: enable all compiler warnings" ON)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY  "llama: enable all compiler warnings in 3rd party libs" OFF)
-
-option(LLAMA_SANITIZE_THREAD    "llama: enable thread sanitizer" OFF)
-option(LLAMA_SANITIZE_ADDRESS   "llama: enable address sanitizer" OFF)
-option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
-
-if (APPLE)
-    option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF)
-    option(LLAMA_NO_AVX        "llama: disable AVX" OFF)
-    option(LLAMA_NO_AVX2       "llama: disable AVX2" OFF)
-    option(LLAMA_NO_FMA        "llama: disable FMA" OFF)
-endif()
-
-if (NOT MSVC)
-    if (LLAMA_SANITIZE_THREAD)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=thread")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
-    endif()
-
-    if (LLAMA_SANITIZE_ADDRESS)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=address -fno-omit-frame-pointer")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
-    endif()
-
-    if (LLAMA_SANITIZE_UNDEFINED)
-        set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=undefined")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
-    endif()
-endif()
-
-if (APPLE AND NOT LLAMA_NO_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-
-        set(LLAMA_EXTRA_LIBS  ${LLAMA_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-        set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-endif()
-
-if (LLAMA_ALL_WARNINGS)
-    if (NOT MSVC)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
-            -Wall                           \
-            -Wextra                         \
-            -Wpedantic                      \
-            -Wshadow                        \
-            -Wcast-qual                     \
-            -Wstrict-prototypes             \
-            -Wpointer-arith                 \
-            -Wno-unused-function            \
-        ")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-            -Wall                               \
-            -Wextra                             \
-            -Wpedantic                          \
-            -Wcast-qual                         \
-        ")
-    else()
-        # todo : msvc
-    endif()
-endif()
-
-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
-
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
-    message(STATUS "ARM detected")
-else()
-    message(STATUS "x86 detected")
-    if (MSVC)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
-        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
-    else()
-        if(NOT LLAMA_NO_AVX)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-        endif()
-        if(NOT LLAMA_NO_AVX2)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-        endif()
-        if(NOT LLAMA_NO_FMA)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-        endif()
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-    endif()
-endif()
-
-# if (LLAMA_PERF)
-#     set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
-# endif()
-
-add_executable(llama
-    main.cpp
-    utils.cpp
-    utils.h)
-
-add_executable(quantize
-    quantize.cpp
-    utils.cpp
-    utils.h)
-
-add_library(ggml
-    ggml.c
-    ggml.h)
-
-target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
-
-target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
-target_include_directories(ggml PUBLIC .)
-target_link_libraries(quantize PRIVATE ggml)
-target_link_libraries(llama PRIVATE ggml)
-target_link_libraries(ggml PRIVATE Threads::Threads)

2  Makefile

@@ -196,7 +196,7 @@ main: main.cpp ggml.o utils.o
 	./main -h
 
 llamalib: expose.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o -shared -o llamalib.dll $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o -shared -o llamacpp.dll $(LDFLAGS)
 
 quantize: quantize.cpp ggml.o utils.o
 	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)

@@ -14,6 +14,6 @@ If you care, **please contribute to [this discussion](https://github.com/ggergan
 - No external libraries or dependencies. That means no Flask, Pybind and whatever. All You Need Is Python.
 
 ## Usage
-- Windows binaries are provided in the form of **llamalib.dll** but if you feel worried go ahead and rebuild it yourself.
+- Windows binaries are provided in the form of **llamacpp.dll** but if you feel worried go ahead and rebuild it yourself.
 - Weights are not included, you can use the llama.cpp quantize.exe to generate them from your official weight files (or download them from...places).
 - To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite.

20  expose.cpp

@@ -17,6 +17,7 @@ extern "C" {
         const int max_context_length;
         const int batch_size;
         const char * model_filename;
+        const int n_parts_overwrite = -1;
     };
     struct generation_inputs
     {

@@ -48,7 +49,9 @@ extern "C" {
         api_params.n_batch = inputs.batch_size;
         api_params.model = inputs.model_filename;
 
-        if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx)) {
+        int n_parts_overwrite = inputs.n_parts_overwrite;
+
+        if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, n_parts_overwrite)) {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, api_params.model.c_str());
             return false;
         }

@@ -67,11 +70,24 @@ extern "C" {
         api_params.repeat_last_n = inputs.rep_pen_range;
         api_params.repeat_penalty = inputs.rep_pen;
 
+        if(api_params.repeat_last_n<1)
+        {
+            api_params.repeat_last_n = 1;
+        }
+        if(api_params.top_k<1)
+        {
+            api_params.top_k = 300; //to disable top_k we actually need to increase this value to a very high number
+        }
         if (api_params.seed < 0)
         {
            api_params.seed = time(NULL);
         }
 
+        //display usage
+        // std::string tst = " ";
+        // char * tst2 = (char*)tst.c_str();
+        // gpt_print_usage(1,&tst2,api_params);
+
         api_params.prompt.insert(0, 1, ' ');
         // tokenize the prompt
         std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(api_vocab, api_params.prompt, true);

@@ -157,7 +173,7 @@ extern "C" {
 
         }
 
-        printf("output: %s",concat_output.c_str());
+        //printf("output: %s",concat_output.c_str());
         output.status = 1;
         _snprintf_s(output.text,sizeof(output.text),_TRUNCATE,"%s",concat_output.c_str());
         return output;

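Because the new n_parts_overwrite field is appended to the end of the C struct in expose.cpp, the Python ctypes mirror has to declare the same fields in the same order and with matching types, otherwise values are read from the wrong offsets. A minimal sketch of the mirrored declaration (the same fields appear in the llama_for_kobold.py hunk below):

import ctypes

class load_model_inputs(ctypes.Structure):
    # Field order and types must match the struct defined in expose.cpp.
    _fields_ = [("threads", ctypes.c_int),
                ("max_context_length", ctypes.c_int),
                ("batch_size", ctypes.c_int),
                ("model_filename", ctypes.c_char_p),
                ("n_parts_overwrite", ctypes.c_int)]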
43  flake.lock (generated)

@@ -1,43 +0,0 @@
-{
-  "nodes": {
-    "flake-utils": {
-      "locked": {
-        "lastModified": 1676283394,
-        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
-        "type": "github"
-      },
-      "original": {
-        "owner": "numtide",
-        "repo": "flake-utils",
-        "type": "github"
-      }
-    },
-    "nixpkgs": {
-      "locked": {
-        "lastModified": 1678470307,
-        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
-        "owner": "NixOS",
-        "repo": "nixpkgs",
-        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
-        "type": "github"
-      },
-      "original": {
-        "owner": "NixOS",
-        "ref": "nixos-unstable",
-        "repo": "nixpkgs",
-        "type": "github"
-      }
-    },
-    "root": {
-      "inputs": {
-        "flake-utils": "flake-utils",
-        "nixpkgs": "nixpkgs"
-      }
-    }
-  },
-  "root": "root",
-  "version": 7
-}

48  flake.nix

@@ -1,48 +0,0 @@
-{
-  inputs = {
-    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
-    flake-utils.url = "github:numtide/flake-utils";
-  };
-  outputs = { self, nixpkgs, flake-utils }:
-    flake-utils.lib.eachDefaultSystem (system:
-      let
-        pkgs = import nixpkgs {
-          inherit system;
-        };
-        llama-python = pkgs.python310.withPackages (ps: with ps; [
-          torch
-          numpy
-          sentencepiece
-        ]);
-      in
-      {
-        packages.default = pkgs.stdenv.mkDerivation {
-          name = "llama.cpp";
-          src = ./.;
-          nativeBuildInputs = with pkgs; [ cmake ];
-          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
-            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
-          ];
-          installPhase = ''
-            mkdir -p $out/bin
-            mv llama $out/bin/llama
-            mv quantize $out/bin/quantize
-            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
-            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
-            chmod +x $out/bin/convert-pth-to-ggml
-          '';
-        };
-        devShells.default = pkgs.mkShell {
-          packages = with pkgs; [
-            cmake
-            llama-python
-          ] ++ lib.optionals stdenv.isDarwin [
-            darwin.apple_sdk.frameworks.Accelerate
-          ];
-        };
-      }
-    );
-}

@@ -10,7 +10,8 @@ class load_model_inputs(ctypes.Structure):
     _fields_ = [("threads", ctypes.c_int),
                 ("max_context_length", ctypes.c_int),
                 ("batch_size", ctypes.c_int),
-                ("model_filename", ctypes.c_char_p)]
+                ("model_filename", ctypes.c_char_p),
+                ("n_parts_overwrite", ctypes.c_int)]
 
 class generation_inputs(ctypes.Structure):
     _fields_ = [("seed", ctypes.c_int),

@@ -27,19 +28,20 @@ class generation_outputs(ctypes.Structure):
                 ("text", ctypes.c_char * 16384)]
 
 dir_path = os.path.dirname(os.path.realpath(__file__))
-handle = ctypes.CDLL(dir_path + "/llamalib.dll")
+handle = ctypes.CDLL(dir_path + "/llamacpp.dll")
 
 handle.load_model.argtypes = [load_model_inputs]
 handle.load_model.restype = ctypes.c_bool
 handle.generate.argtypes = [generation_inputs]
 handle.generate.restype = generation_outputs
 
-def load_model(model_filename,batch_size=8,max_context_length=512,threads=4):
+def load_model(model_filename,batch_size=8,max_context_length=512,threads=4,n_parts_overwrite=-1):
     inputs = load_model_inputs()
     inputs.model_filename = model_filename.encode("UTF-8")
     inputs.batch_size = batch_size
     inputs.max_context_length = max_context_length
     inputs.threads = threads
+    inputs.n_parts_overwrite = n_parts_overwrite
     ret = handle.load_model(inputs)
     return ret
 

@@ -233,9 +235,13 @@ if __name__ == '__main__':
         print("Cannot find model file: " + sys.argv[1])
         exit()
 
+    mdl_nparts = 1
+    for n in range(1,9):
+        if os.path.exists(sys.argv[1]+"."+str(n)):
+            mdl_nparts += 1
     modelname = os.path.abspath(sys.argv[1])
     print("Loading model: " + modelname)
-    loadok = load_model(modelname,128,maxctx,4)
+    loadok = load_model(modelname,128,maxctx,4,mdl_nparts)
     print("Load Model OK: " + str(loadok))
 
     if loadok:

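Putting the pieces together, loading a two-part sharded model through the updated interface looks roughly like this; the file names are illustrative, and load_model is the ctypes wrapper defined above in llama_for_kobold.py:

# Assumes ggml-model-q4_0.bin and ggml-model-q4_0.bin.1 exist on disk.
model_path = "ggml-model-q4_0.bin"
nparts = 1
for n in range(1, 9):
    if os.path.exists(model_path + "." + str(n)):
        nparts += 1

loadok = load_model(model_path, 128, 512, 4, n_parts_overwrite=nparts)
print("Load Model OK: " + str(loadok))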
BIN  llamacpp.dll (new file; binary file not shown)
BIN  llamalib.dll (binary file not shown)
11  main.cpp

@@ -86,7 +86,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
+bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, int n_parts_overwrite=-1) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
 

@@ -132,6 +132,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
     n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
     n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
+    if(n_parts_overwrite>0)
+    {
+        n_parts = n_parts_overwrite;
+    }
 
     fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
     fprintf(stderr, "%s: n_ctx   = %d\n", __func__, hparams.n_ctx);

@@ -794,6 +798,11 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+
     if (params.seed < 0) {
         params.seed = time(NULL);
     }

BIN  main.exe (binary file not shown)