clean and refactor handling of flags

Concedo, 2023-04-12 23:25:31 +08:00
commit 1bd5992da4
10 changed files with 106 additions and 245 deletions

==== Makefile ====

@@ -123,14 +123,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif
 OPENBLAS_BUILD =
 CLBLAST_BUILD =
+NOAVX2_BUILD =
 OPENBLAS_NOAVX2_BUILD =
 ifeq ($(OS),Windows_NT)
 	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas.dll $(LDFLAGS)
 	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/OpenCL.lib lib/clblast.lib -shared -o koboldcpp_clblast.dll $(LDFLAGS)
-	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas_noavx2.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas_noavx2.dll $(LDFLAGS)
+	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas_noavx2.dll $(LDFLAGS)
+	NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o -shared -o koboldcpp_noavx2.dll $(LDFLAGS)
 else
 ifndef LLAMA_OPENBLAS
 ifndef LLAMA_CLBLAST

@@ -154,7 +156,7 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
-default: llamalib llamalib_openblas llamalib_openblas_noavx2 llamalib_clblast
+default: llamalib llamalib_noavx2 llamalib_openblas llamalib_openblas_noavx2 llamalib_clblast
 #
 # Build library

@@ -166,6 +168,9 @@ ggml.o: ggml.c ggml.h
 ggml_openblas.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(BONUSCFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas.o
+ggml_noavx2.o: ggml.c ggml.h
+	$(CC) $(CFLAGS) -c ggml.c -o ggml_noavx2.o
 ggml_openblas_noavx2.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas_noavx2.o

@@ -176,7 +181,7 @@ ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
 	$(CC) $(CFLAGS) $(BONUSCFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o
 ggml_v1_noavx2.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
-	$(CC) $(CFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o
+	$(CC) $(CFLAGS) -c otherarch/ggml_v1.c -o ggml_v1_noavx2.o
 llama.o: llama.cpp llama.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

@@ -194,7 +199,7 @@ gpttype_adapter.o:
 	$(CXX) $(CXXFLAGS) -c gpttype_adapter.cpp -o gpttype_adapter.o
 clean:
-	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize-stats perplexity embedding main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll gptj.exe gpt2.exe
+	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize-stats perplexity embedding main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll gptj.exe gpt2.exe
 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)

@@ -208,6 +213,9 @@ llamalib: ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 llamalib_openblas: ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_BUILD)
+llamalib_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
+	$(NOAVX2_BUILD)
 llamalib_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_NOAVX2_BUILD)

==== expose.cpp ====

@@ -31,9 +31,15 @@ extern "C"
     std::string model = inputs.model_filename;
     file_format = check_file_format(model.c_str());
-    //first digit is platform, second is devices
-    int platform = inputs.clblast_info/10;
-    int devices = inputs.clblast_info%10;
+    //first digit is whether configured, second is platform, third is devices
+    int parseinfo = inputs.clblast_info;
+    std::string usingclblast = "KCPP_CLBLAST_CONFIGURED="+std::to_string(parseinfo>0?1:0);
+    putenv((char*)usingclblast.c_str());
+    parseinfo = parseinfo%100; //keep last 2 digits
+    int platform = parseinfo/10;
+    int devices = parseinfo%10;
     std::string platformenv = "KCPP_CLBLAST_PLATFORM="+std::to_string(platform);
     std::string deviceenv = "KCPP_CLBLAST_DEVICES="+std::to_string(devices);
     putenv((char*)platformenv.c_str());
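Note: clblast_info packs three values into one integer. The hundreds digit
records that CLBlast was configured at all, the tens digit is the platform
ID, and the ones digit is the device ID. A minimal Python sketch of the
round-trip (hypothetical helper names; both sides inline this arithmetic):

    def pack_clblast_info(platform: int, device: int) -> int:
        # adding 100 marks "CLBlast was configured"; tens digit = platform,
        # ones digit = device (both are single-digit IDs, 0-8)
        return 100 + platform * 10 + device

    def unpack_clblast_info(info: int):
        configured = 1 if info > 0 else 0
        info = info % 100            # keep last 2 digits
        return configured, info // 10, info % 10

    assert unpack_clblast_info(pack_clblast_info(1, 0)) == (1, 1, 0)
    assert unpack_clblast_info(0) == (0, 0, 0)  # CLBlast never configured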

==== koboldcpp.py ====

@@ -42,13 +42,17 @@ def init_library():
     global handle, use_blas, use_clblast, use_noavx2
     libname = ""
     if use_noavx2:
-        libname = "koboldcpp_openblas_noavx2.dll"
-    elif use_blas:
-        libname = "koboldcpp_openblas.dll"
-    elif use_clblast:
-        libname = "koboldcpp_clblast.dll"
+        if use_blas:
+            libname = "koboldcpp_openblas_noavx2.dll"
+        else:
+            libname = "koboldcpp_noavx2.dll"
     else:
-        libname = "koboldcpp.dll"
+        if use_clblast:
+            libname = "koboldcpp_clblast.dll"
+        elif use_blas:
+            libname = "koboldcpp_openblas.dll"
+        else:
+            libname = "koboldcpp.dll"
     print("Initializing dynamic library: " + libname)
     dir_path = os.path.dirname(os.path.realpath(__file__))
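The nested branch above is a small priority table: no-AVX2 mode chooses
between the two noavx2 DLLs by the BLAS flag (CLBlast has no noavx2 build),
otherwise CLBlast outranks OpenBLAS, with plain koboldcpp.dll as the
fallback. An equivalent flattened sketch (illustrative, not the shipped code):

    def pick_libname(use_noavx2: bool, use_clblast: bool, use_blas: bool) -> str:
        # same decision table as the branch in init_library() above
        if use_noavx2:
            return "koboldcpp_openblas_noavx2.dll" if use_blas else "koboldcpp_noavx2.dll"
        if use_clblast:
            return "koboldcpp_clblast.dll"
        return "koboldcpp_openblas.dll" if use_blas else "koboldcpp.dll"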
@@ -72,7 +76,7 @@ def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwr
     inputs.use_mmap = use_mmap
     clblastids = 0
     if args.useclblast:
-        clblastids = int(args.useclblast[0])*10 + int(args.useclblast[1])
+        clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
     inputs.clblast_info = clblastids
     ret = handle.load_model(inputs)
     return ret
@@ -313,30 +317,36 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):
 def main(args):
     global use_blas, use_clblast, use_noavx2
-    if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas.dll")):
-        print("Warning: libopenblas.dll or koboldcpp_openblas.dll not found. Non-BLAS library will be used. Ignore this if you have manually linked with OpenBLAS.")
-        use_blas = False
-    elif os.name != 'nt':
-        print("Prebuilt OpenBLAS binaries only available for windows. Please manually build/link libopenblas from makefile with LLAMA_OPENBLAS=1")
-        use_blas = False
+    use_blas = False
+    use_clblast = False
+    use_noavx2 = False
+    if os.name != 'nt':
+        print("You are not on Windows. Default koboldcpp.dll library file will be used. Remember to manually link with OpenBLAS using LLAMA_OPENBLAS=1, or CLBlast with LLAMA_CLBLAST=1 if you want to use them.")
+    elif args.noavx2:
+        use_noavx2 = True
+        if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas_noavx2.dll")):
+            print("Warning: libopenblas.dll or koboldcpp_openblas_noavx2.dll not found. Non-BLAS library will be used.")
+        elif args.noblas:
+            print("Attempting to use non-avx2 compatibility library without OpenBLAS.")
+        else:
+            use_blas = True
+            print("Attempting to use non-avx2 compatibility library with OpenBLAS.")
     elif args.useclblast:
         if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "clblast.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_clblast.dll")):
             print("Warning: clblast.dll or koboldcpp_clblast.dll not found. Non-BLAS library will be used. Ignore this if you have manually linked with CLBlast.")
         else:
             print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast.dll will be required.")
             use_clblast = True
-    elif args.noavx2:
-        if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas_noavx2.dll")):
-            print("Warning: libopenblas.dll or koboldcpp_openblas_noavx2.dll not found. This mode cannot be used.")
-        elif os.name == 'nt':
-            print("Attempting to use non-avx2 compatibility openblas library.")
-            use_noavx2 = True
-        else:
-            print("Non-AVX2 compatibility OpenBLAS mode only available on windows. On other OS, please manually rebuild without AVX2 flags.")
-    elif not args.noblas:
-        print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
-        use_blas = True
+    else:
+        if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas.dll")):
+            print("Warning: libopenblas.dll or koboldcpp_openblas.dll not found. Non-BLAS library will be used.")
+        elif args.noblas:
+            print("Attempting to use library without OpenBLAS.")
+        else:
+            use_blas = True
+            print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
     if args.psutil_set_threads:
         import psutil
         args.threads = psutil.cpu_count(logical=False)
@@ -421,9 +431,9 @@ if __name__ == '__main__':
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutil to determine thread count based on physical cores.", action='store_true')
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
-    parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
     compatgroup = parser.add_mutually_exclusive_group()
     compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
+    compatgroup.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --noblas or --clblast.", action='store_true')
     compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     args = parser.parse_args()
     main(args)
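With --noavx2 moved inside the mutually exclusive group, argparse itself now
rejects the combinations the old help text only warned about. A standalone
sketch of just the regrouped flags (the real parser defines many more):

    import argparse

    parser = argparse.ArgumentParser()
    compatgroup = parser.add_mutually_exclusive_group()
    compatgroup.add_argument("--noblas", action='store_true')
    compatgroup.add_argument("--noavx2", action='store_true')
    compatgroup.add_argument("--useclblast", type=int, choices=range(0,9), nargs=2)

    print(parser.parse_args(["--noavx2"]))
    # parser.parse_args(["--noavx2", "--useclblast", "1", "0"]) exits with:
    # "argument --useclblast: not allowed with argument --noavx2"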

==== llama.cpp ====

@@ -1949,4 +1949,4 @@ const char * llama_print_system_info(void) {
 // For internal test use
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
 }

==== llama_adapter.cpp ====

@@ -7,6 +7,11 @@
 //No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields
 //Python will ALWAYS provide the memory, we just write to it.
+
+// Defines sigaction on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
 #include <time.h>
 #include "./examples/main/main.cpp"
 #include "ggml.h"
@@ -38,11 +43,12 @@ bool llama_load_model(const load_model_inputs inputs, FileFormat in_file_format)
     modelname = inputs.model_filename;
     ctx_params.n_ctx = inputs.max_context_length;
-    ctx_params.n_parts = inputs.n_parts_overwrite;
+    ctx_params.n_parts = -1;//inputs.n_parts_overwrite;
     ctx_params.seed = -1;
     ctx_params.f16_kv = inputs.f16_kv;
     ctx_params.logits_all = false;
     ctx_params.use_mmap = inputs.use_mmap;
+    ctx_params.use_mlock = false;
     file_format = in_file_format;

==== llamaextra.cpp ====

@@ -2,22 +2,6 @@
 #include "llamaextra.h"
 #include "llama.cpp"
-#include <cassert>
-#include <cstring>
-#include <fstream>
-#include <regex>
-#include <iostream>
-#include <iterator>
-#include <queue>
-#include <string>
-#include <math.h>
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
-#include <alloca.h>
-#endif
 // TODO: Calculate this constant from the vocabulary
 #define MAX_TOKEN_LEN 18

==== llamaextra.h ====

@@ -15,6 +15,4 @@
 #include "llama.h"
 #include "ggml.h"
 std::vector<llama_token> legacy_llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
-
-

==== make_pyinstaller.bat ====

@@ -1 +1 @@
-pyinstaller --noconfirm --onefile --clean --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." "./koboldcpp.py" -n "koboldcpp.exe"
+pyinstaller --noconfirm --onefile --clean --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." "./koboldcpp.py" -n "koboldcpp.exe"

====

@@ -143,11 +143,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
     ctx_size += (6 + 12*n_layer)*256; // object overhead
-    // if(wtype==GGML_TYPE_Q4_0 || wtype==GGML_TYPE_Q4_1)
-    // {
-    //     //quantized needs more context
-    //     ctx_size = (ctx_size*4);
-    // }
     printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
 }
@@ -157,6 +152,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
     struct ggml_init_params params = {
         .mem_size = ctx_size,
         .mem_buffer = NULL,
+        .no_alloc = false,
     };
     model.ctx = ggml_init(params);
@@ -273,9 +269,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
     }
     int32_t nelements = 1;
-    int32_t ne[2] = { 1, 1 };
+    int64_t ne[2] = { 1, 1 };
     for (int i = 0; i < n_dims; ++i) {
-        fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        int32_t ne_cur;
+        fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
+        ne[i] = ne_cur;
         nelements *= ne[i];
     }
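Both loaders now read each dimension as the int32 stored in the model file
and widen it into the 64-bit ne[] that newer ggml tensors use. A rough
Python rendering of the read loop (assumes fin is an open binary file
positioned at the dimension list; little-endian as on the x86 builds
targeted here):

    import struct

    def read_dims(fin, n_dims: int):
        # the file stores each dimension as int32; widen to 64-bit on read
        ne = [1, 1]
        nelements = 1
        for i in range(n_dims):
            (ne_cur,) = struct.unpack("<i", fin.read(4))
            ne[i] = ne_cur
            nelements *= ne[i]
        return ne, nelements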
@@ -294,14 +292,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
     }
     if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n",
             __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
         return ModelLoadResult::FAIL;
     }
     if (0) {
         static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-        printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+        printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
     }
     size_t bpe = 0;
@@ -392,6 +390,7 @@ bool gpt2_eval(
     struct ggml_init_params params = {
         .mem_size = buf_size,
         .mem_buffer = buf,
+        .no_alloc = false,
     };
     struct ggml_context * ctx0 = ggml_init(params);
@@ -662,153 +661,4 @@ bool gpt2_eval(
     ggml_free(ctx0);
     return true;
 }
-// int main(int argc, char ** argv) {
-//     ggml_time_init();
-//     const int64_t t_main_start_us = ggml_time_us();
-//     gpt_params params;
-//     params.model = "models/gpt-2-117M/ggml-model.bin";
-//     if (utils_gpt_params_parse(argc, argv, params) == false) {
-//         return 1;
-//     }
-//     if (params.seed < 0) {
-//         params.seed = time(NULL);
-//     }
-//     printf("%s: seed = %d\n", __func__, params.seed);
-//     std::mt19937 rng(params.seed);
-//     if (params.prompt.empty()) {
-//         if( !isatty(STDIN_FILENO) ){
-//             std::string line;
-//             while( std::getline(std::cin, line) ){
-//                 params.prompt = params.prompt + "\n" + line;
-//             }
-//         } else {
-//             params.prompt = utils_gpt_random_prompt(rng);
-//         }
-//     }
-//     int64_t t_load_us = 0;
-//     gpt_vocab vocab;
-//     gpt2_model model;
-//     // load the model
-//     {
-//         const int64_t t_start_us = ggml_time_us();
-//         if (!gpt2_model_load(params.model, model, vocab)) {
-//             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-//             return 1;
-//         }
-//         t_load_us = ggml_time_us() - t_start_us;
-//     }
-//     int n_past = 0;
-//     int64_t t_sample_us = 0;
-//     int64_t t_predict_us = 0;
-//     std::vector<float> logits;
-//     // tokenize the prompt
-//     std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-//     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-//     printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-//     printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
-//     for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
-//         printf("%d ", embd_inp[i]);
-//     }
-//     printf("\n\n");
-//     // submit the input prompt token-by-token
-//     // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-//     std::vector<gpt_vocab::id> embd;
-//     // determine the required inference memory per token:
-//     size_t mem_per_token = 0;
-//     gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-//     for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-//         // predict
-//         if (embd.size() > 0) {
-//             const int64_t t_start_us = ggml_time_us();
-//             if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-//                 printf("Failed to predict\n");
-//                 return 1;
-//             }
-//             t_predict_us += ggml_time_us() - t_start_us;
-//         }
-//         n_past += embd.size();
-//         embd.clear();
-//         if (i >= embd_inp.size()) {
-//             // sample next token
-//             const int top_k = params.top_k;
-//             const float top_p = params.top_p;
-//             const float temp = params.temp;
-//             const int n_vocab = model.hparams.n_vocab;
-//             gpt_vocab::id id = 0;
-//             {
-//                 const int64_t t_start_sample_us = ggml_time_us();
-//                 id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-//                 t_sample_us += ggml_time_us() - t_start_sample_us;
-//             }
-//             // add it to the context
-//             embd.push_back(id);
-//         } else {
-//             // if here, it means we are still processing the input prompt
-//             for (int k = i; k < embd_inp.size(); k++) {
-//                 embd.push_back(embd_inp[k]);
-//                 if (embd.size() >= params.n_batch) {
-//                     break;
-//                 }
-//             }
-//             i += embd.size() - 1;
-//         }
-//         // display text
-//         for (auto id : embd) {
-//             printf("%s", vocab.id_to_token[id].c_str());
-//         }
-//         fflush(stdout);
-//         // end of text token
-//         if (embd.back() == 50256) {
-//             break;
-//         }
-//     }
-//     // report timing
-//     {
-//         const int64_t t_main_end_us = ggml_time_us();
-//         printf("\n\n");
-//         printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-//         printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-//         printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-//         printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-//         printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-//     }
-//     ggml_free(model.ctx);
-//     return 0;
-// }

====

@@ -151,6 +151,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
     struct ggml_init_params params = {
         .mem_size = ctx_size,
         .mem_buffer = NULL,
+        .no_alloc = false,
     };
     model.ctx = ggml_init(params);
@@ -263,10 +264,12 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         break;
     }
-    int32_t nelements = 1;
-    int32_t ne[2] = { 1, 1 };
+    int64_t nelements = 1;
+    int64_t ne[2] = { 1, 1 };
     for (int i = 0; i < n_dims; ++i) {
-        fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        int32_t ne_cur;
+        fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
+        ne[i] = ne_cur;
         nelements *= ne[i];
    }
@@ -305,7 +308,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
     if (0) {
         static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-        printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+        printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
     }
     size_t bpe = 0;
@@ -329,7 +332,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
     }
     fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
     //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
     total_size += ggml_nbytes(tensor);
     if (++n_tensors % 8 == 0) {
@@ -398,6 +401,7 @@ bool gptj_eval(
     struct ggml_init_params params = {
         .mem_size = buf_size,
         .mem_buffer = buf,
+        .no_alloc = false,
     };
     struct ggml_context * ctx0 = ggml_init(params);
@@ -428,14 +432,17 @@ bool gptj_eval(
     // self-attention
     {
-        struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur);
-        struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur);
-        struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur);
+        struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+        struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
         // store key and value to memory
-        if (N >= 1) {
+        {
+            struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur));
             struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-            struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+            struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
+                    ( n_ctx)*ggml_element_size(model.memory_v),
+                    (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
@@ -444,21 +451,15 @@ bool gptj_eval(
     // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
     struct ggml_tensor * Q =
         ggml_permute(ctx0,
-            ggml_rope(ctx0,
-                ggml_cpy(ctx0,
-                    Qcur,
-                    ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                n_past, n_rot, 0),
+            Qcur,
             0, 2, 1, 3);
     // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
     struct ggml_tensor * K =
         ggml_permute(ctx0,
-            ggml_rope(ctx0,
-                ggml_reshape_3d(ctx0,
-                    ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                    n_embd/n_head, n_head, n_past + N),
-                n_past, n_rot, 1),
+            ggml_reshape_3d(ctx0,
+                ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+                n_embd/n_head, n_head, n_past + N),
             0, 2, 1, 3);
     // K * Q
@@ -478,17 +479,15 @@ bool gptj_eval(
     struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
     // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-    struct ggml_tensor * V_trans =
-        ggml_cpy(ctx0,
-            ggml_permute(ctx0,
-                ggml_reshape_3d(ctx0,
-                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                    n_embd/n_head, n_head, n_past + N),
-                1, 2, 0, 3),
-            ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+    struct ggml_tensor * V =
+        ggml_view_3d(ctx0, model.memory_v,
+            n_past + N, n_embd/n_head, n_head,
+            n_ctx*ggml_element_size(model.memory_v),
+            n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
+            il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
     // KQV = transpose(V) * KQ_soft_max
-    struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+    struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
     // KQV_merged = KQV.permute(0, 2, 1, 3)
     struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
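The value cache is now filled with already-transposed values (the hunk at
-428 above) and read back through a strided ggml_view_3d, removing the
per-token ggml_permute + ggml_cpy that used to build V_trans. A rough numpy
analogue of the layout (illustrative only; ggml lists dimensions in the
reverse of numpy shape order):

    import numpy as np

    n_ctx, n_embd, n_head, N, n_past = 8, 4, 2, 2, 3

    # per-layer value cache laid out channel-major: one row of n_ctx time
    # slots per embedding channel (mirrors the ggml_view_2d strides)
    memory_v = np.zeros((n_embd, n_ctx), dtype=np.float32)

    Vcur = np.random.rand(N, n_embd).astype(np.float32)
    memory_v[:, n_past:n_past + N] = Vcur.T   # written transposed, once

    # read back per head as a strided view, no copy: the analogue of
    # ggml_view_3d replacing the old V_trans copy
    V = memory_v.reshape(n_head, n_embd // n_head, n_ctx)[:, :, :n_past + N]
    assert V.base is not None   # a view into memory_v, not a fresh buffer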
@@ -587,4 +586,4 @@ bool gptj_eval(
     ggml_free(ctx0);
     return true;
 }