diff --git a/Makefile b/Makefile
index 5a2b3a52e..278ab6361 100644
--- a/Makefile
+++ b/Makefile
@@ -123,14 +123,16 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
 	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif

-OPENBLAS_BUILD =
-CLBLAST_BUILD =
-OPENBLAS_NOAVX2_BUILD =
+OPENBLAS_BUILD =
+CLBLAST_BUILD =
+NOAVX2_BUILD =
+OPENBLAS_NOAVX2_BUILD =

 ifeq ($(OS),Windows_NT)
 	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas.dll $(LDFLAGS)
 	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/OpenCL.lib lib/clblast.lib -shared -o koboldcpp_clblast.dll $(LDFLAGS)
-	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas_noavx2.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas_noavx2.dll $(LDFLAGS)
+	OPENBLAS_NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o lib/libopenblas.lib -shared -o koboldcpp_openblas_noavx2.dll $(LDFLAGS)
+	NOAVX2_BUILD = $(CXX) $(CXXFLAGS) ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o -shared -o koboldcpp_noavx2.dll $(LDFLAGS)
 else
 ifndef LLAMA_OPENBLAS
 ifndef LLAMA_CLBLAST
@@ -154,7 +156,7 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )

-default: llamalib llamalib_openblas llamalib_openblas_noavx2 llamalib_clblast
+default: llamalib llamalib_noavx2 llamalib_openblas llamalib_openblas_noavx2 llamalib_clblast

 #
 # Build library
@@ -166,6 +168,9 @@ ggml.o: ggml.c ggml.h
 ggml_openblas.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) $(BONUSCFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas.o

+ggml_noavx2.o: ggml.c ggml.h
+	$(CC) $(CFLAGS) -c ggml.c -o ggml_noavx2.o
+
 ggml_openblas_noavx2.o: ggml.c ggml.h
 	$(CC) $(CFLAGS) -DGGML_USE_OPENBLAS -c ggml.c -o ggml_openblas_noavx2.o

@@ -176,7 +181,7 @@ ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
 	$(CC) $(CFLAGS) $(BONUSCFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o

 ggml_v1_noavx2.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
-	$(CC) $(CFLAGS) -c otherarch/ggml_v1.c -o ggml_v1.o
+	$(CC) $(CFLAGS) -c otherarch/ggml_v1.c -o ggml_v1_noavx2.o

 llama.o: llama.cpp llama.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
@@ -194,7 +199,7 @@ gpttype_adapter.o:
 	$(CXX) $(CXXFLAGS) -c gpttype_adapter.cpp -o gpttype_adapter.o

 clean:
-	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize-stats perplexity embedding main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll gptj.exe gpt2.exe
+	rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize-stats perplexity embedding main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe koboldcpp.dll koboldcpp_openblas.dll koboldcpp_noavx2.dll koboldcpp_openblas_noavx2.dll koboldcpp_clblast.dll gptj.exe gpt2.exe

 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@@ -208,6 +213,9 @@ llamalib: ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 llamalib_openblas: ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_BUILD)

+llamalib_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
+	$(NOAVX2_BUILD)
+
 llamalib_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_NOAVX2_BUILD)

diff --git a/expose.cpp b/expose.cpp
index 6893882da..893f427a0 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -31,9 +31,15 @@ extern "C"
         std::string model = inputs.model_filename;
         file_format = check_file_format(model.c_str());

-        //first digit is platform, second is devices
-        int platform = inputs.clblast_info/10;
-        int devices = inputs.clblast_info%10;
+        //first digit is whether configured, second is platform, third is devices
+        int parseinfo = inputs.clblast_info;
+
+        std::string usingclblast = "KCPP_CLBLAST_CONFIGURED="+std::to_string(parseinfo>0?1:0);
+        putenv((char*)usingclblast.c_str());
+
+        parseinfo = parseinfo%100; //keep last 2 digits
+        int platform = parseinfo/10;
+        int devices = parseinfo%10;
         std::string platformenv = "KCPP_CLBLAST_PLATFORM="+std::to_string(platform);
         std::string deviceenv = "KCPP_CLBLAST_DEVICES="+std::to_string(devices);
         putenv((char*)platformenv.c_str());
diff --git a/koboldcpp.py b/koboldcpp.py
index 187e8279b..7301eb289 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -42,13 +42,17 @@ def init_library():
     global handle, use_blas, use_clblast, use_noavx2
     libname = ""
     if use_noavx2:
-        libname = "koboldcpp_openblas_noavx2.dll"
-    elif use_blas:
-        libname = "koboldcpp_openblas.dll"
-    elif use_clblast:
-        libname = "koboldcpp_clblast.dll"
+        if use_blas:
+            libname = "koboldcpp_openblas_noavx2.dll"
+        else:
+            libname = "koboldcpp_noavx2.dll"
     else:
-        libname = "koboldcpp.dll"
+        if use_clblast:
+            libname = "koboldcpp_clblast.dll"
+        elif use_blas:
+            libname = "koboldcpp_openblas.dll"
+        else:
+            libname = "koboldcpp.dll"

     print("Initializing dynamic library: " + libname)
     dir_path = os.path.dirname(os.path.realpath(__file__))
@@ -72,7 +76,7 @@ def load_model(model_filename,batch_size=8,max_context_length=512,n_parts_overwr
     inputs.use_mmap = use_mmap
     clblastids = 0
     if args.useclblast:
-        clblastids = int(args.useclblast[0])*10 + int(args.useclblast[1])
+        clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
     inputs.clblast_info = clblastids
     ret = handle.load_model(inputs)
     return ret
@@ -313,30 +317,36 @@ def RunServerMultiThreaded(addr, port, embedded_kailite = None):

 def main(args):
     global use_blas, use_clblast, use_noavx2
-    if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas.dll")):
-        print("Warning: libopenblas.dll or koboldcpp_openblas.dll not found. Non-BLAS library will be used. Ignore this if you have manually linked with OpenBLAS.")
-        use_blas = False
-    elif os.name != 'nt':
-        print("Prebuilt OpenBLAS binaries only available for windows. Please manually build/link libopenblas from makefile with LLAMA_OPENBLAS=1")
-        use_blas = False
+    use_blas = False
+    use_clblast = False
+    use_noavx2 = False
+
+    if os.name != 'nt':
+        print("You are not on Windows. Default koboldcpp.dll library file will be used. Remember to manually link with OpenBLAS using LLAMA_OPENBLAS=1, or CLBlast with LLAMA_CLBLAST=1 if you want to use them.")
+    elif args.noavx2:
+        use_noavx2 = True
+        if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas_noavx2.dll")):
+            print("Warning: libopenblas.dll or koboldcpp_openblas_noavx2.dll not found. Non-BLAS library will be used.")
+        elif args.noblas:
+            print("Attempting to use non-avx2 compatibility library without OpenBLAS.")
+        else:
+            use_blas = True
+            print("Attempting to use non-avx2 compatibility library with OpenBLAS.")
     elif args.useclblast:
         if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "clblast.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_clblast.dll")):
             print("Warning: clblast.dll or koboldcpp_clblast.dll not found. Non-BLAS library will be used. Ignore this if you have manually linked with CLBlast.")
         else:
             print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast.dll will be required.")
             use_clblast = True
-    elif args.noavx2:
-        if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas_noavx2.dll")):
-            print("Warning: libopenblas.dll or koboldcpp_openblas_noavx2.dll not found. This mode cannot be used.")
-        elif os.name == 'nt':
-            print("Attempting to use non-avx2 compatibility openblas library.")
-            use_noavx2 = True
+    else:
+        if not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "libopenblas.dll")) or not os.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)), "koboldcpp_openblas.dll")):
+            print("Warning: libopenblas.dll or koboldcpp_openblas.dll not found. Non-BLAS library will be used.")
+        elif args.noblas:
+            print("Attempting to use default library without OpenBLAS.")
         else:
-            print("Non-AVX2 compatibility OpenBLAS mode only available on windows. On other OS, please manually rebuild without AVX2 flags.")
-    elif not args.noblas:
-        print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
-        use_blas = True
-
+            use_blas = True
+            print("Attempting to use OpenBLAS library for faster prompt ingestion. A compatible libopenblas.dll will be required.")
+
     if args.psutil_set_threads:
         import psutil
         args.threads = psutil.cpu_count(logical=False)
@@ -421,9 +431,9 @@ if __name__ == '__main__':
     parser.add_argument("--psutil_set_threads", help="Experimental flag. If set, uses psutils to determine thread count based on physical cores.", action='store_true')
     parser.add_argument("--stream", help="Uses pseudo streaming", action='store_true')
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
+    parser.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --clblast.", action='store_true')
     compatgroup = parser.add_mutually_exclusive_group()
     compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
-    compatgroup.add_argument("--noavx2", help="Do not use AVX2 instructions, a slower compatibility mode for older devices. Does not work with --noblas or --clblast.", action='store_true')
     compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     args = parser.parse_args()
     main(args)
diff --git a/llama.cpp b/llama.cpp
index 752795aef..88ca5e223 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1949,4 +1949,4 @@ const char * llama_print_system_info(void) {
 // For internal test use
 std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx) {
     return ctx->model.tensors_by_name;
-}
+}
\ No newline at end of file
diff --git a/llama_adapter.cpp b/llama_adapter.cpp
index 647eb91a6..93f46fae6 100644
--- a/llama_adapter.cpp
+++ b/llama_adapter.cpp
@@ -7,6 +7,11 @@
 //No dynamic memory allocation! Setup structs with FIXED (known) shapes and sizes for ALL output fields
 //Python will ALWAYS provide the memory, we just write to it.

+// Defines sigaction on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include
 #include "./examples/main/main.cpp"
 #include "ggml.h"
@@ -38,11 +43,12 @@ bool llama_load_model(const load_model_inputs inputs, FileFormat in_file_format)
     modelname = inputs.model_filename;

     ctx_params.n_ctx = inputs.max_context_length;
-    ctx_params.n_parts = inputs.n_parts_overwrite;
+    ctx_params.n_parts = -1;//inputs.n_parts_overwrite;
     ctx_params.seed = -1;
     ctx_params.f16_kv = inputs.f16_kv;
     ctx_params.logits_all = false;
     ctx_params.use_mmap = inputs.use_mmap;
+    ctx_params.use_mlock = false;

     file_format = in_file_format;
diff --git a/llamaextra.cpp b/llamaextra.cpp
index 41b511691..9b4be8c9a 100644
--- a/llamaextra.cpp
+++ b/llamaextra.cpp
@@ -2,22 +2,6 @@
 #include "llamaextra.h"
 #include "llama.cpp"

-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
-#include <alloca.h>
-#endif
-
 // TODO: Calculate this constant from the vocabulary
 #define MAX_TOKEN_LEN 18
diff --git a/llamaextra.h b/llamaextra.h
index 0098ed76e..ab36c39fd 100644
--- a/llamaextra.h
+++ b/llamaextra.h
@@ -15,6 +15,4 @@
 #include "llama.h"
 #include "ggml.h"

-
-
 std::vector<llama_token> legacy_llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
\ No newline at end of file
diff --git a/make_pyinstaller.bat b/make_pyinstaller.bat
index e065fdd42..6b8267fbe 100644
--- a/make_pyinstaller.bat
+++ b/make_pyinstaller.bat
@@ -1 +1 @@
-pyinstaller --noconfirm --onefile --clean --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." "./koboldcpp.py" -n "koboldcpp.exe"
\ No newline at end of file
+pyinstaller --noconfirm --onefile --clean --console --icon "./niko.ico" --add-data "./klite.embd;." --add-data "./koboldcpp.dll;." --add-data "./koboldcpp_openblas.dll;." --add-data "./koboldcpp_noavx2.dll;." --add-data "./koboldcpp_openblas_noavx2.dll;." --add-data "./libopenblas.dll;." --add-data "./koboldcpp_clblast.dll;." --add-data "./clblast.dll;." "./koboldcpp.py" -n "koboldcpp.exe"
\ No newline at end of file
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index b24dfd36d..5e3d625c4 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -143,11 +143,6 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         ctx_size += (6 + 12*n_layer)*256; // object overhead

-        // if(wtype==GGML_TYPE_Q4_0 || wtype==GGML_TYPE_Q4_1)
-        // {
-        //     //quantized needs more context
-        //     ctx_size = (ctx_size*4);
-        // }
         printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
@@ -157,6 +152,7 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
         struct ggml_init_params params = {
             .mem_size = ctx_size,
             .mem_buffer = NULL,
+            .no_alloc = false,
         };

         model.ctx = ggml_init(params);
@@ -273,9 +269,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
        }

        int32_t nelements = 1;
-       int32_t ne[2] = { 1, 1 };
+       int64_t ne[2] = { 1, 1 };
        for (int i = 0; i < n_dims; ++i) {
-           fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+           int32_t ne_cur;
+           fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
+           ne[i] = ne_cur;
            nelements *= ne[i];
        }
@@ -294,14 +292,14 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
        }

        if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-           fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+           fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n",
                    __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
            return ModelLoadResult::FAIL;
        }

        if (0) {
            static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-           printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+           printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
        }

        size_t bpe = 0;
@@ -392,6 +390,7 @@ bool gpt2_eval(
     struct ggml_init_params params = {
         .mem_size = buf_size,
         .mem_buffer = buf,
+        .no_alloc = false,
     };

     struct ggml_context * ctx0 = ggml_init(params);
@@ -662,153 +661,4 @@ bool gpt2_eval(
     ggml_free(ctx0);

     return true;
-}
-
-// int main(int argc, char ** argv) {
-//     ggml_time_init();
-//     const int64_t t_main_start_us = ggml_time_us();
-
-//     gpt_params params;
-//     params.model = "models/gpt-2-117M/ggml-model.bin";
-
-//     if (utils_gpt_params_parse(argc, argv, params) == false) {
-//         return 1;
-//     }
-
-//     if (params.seed < 0) {
-//         params.seed = time(NULL);
-//     }
-
-//     printf("%s: seed = %d\n", __func__, params.seed);
-
-//     std::mt19937 rng(params.seed);
-//     if (params.prompt.empty()) {
-//         if( !isatty(STDIN_FILENO) ){
-//             std::string line;
-//             while( std::getline(std::cin, line) ){
-//                 params.prompt = params.prompt + "\n" + line;
-//             }
-//         } else {
-//             params.prompt = utils_gpt_random_prompt(rng);
-//         }
-//     }
-
-//     int64_t t_load_us = 0;
-
-//     gpt_vocab vocab;
-//     gpt2_model model;
-
-//     // load the model
-//     {
-//         const int64_t t_start_us = ggml_time_us();
-
-//         if (!gpt2_model_load(params.model, model, vocab)) {
-//             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-//             return 1;
-//         }
-
-//         t_load_us = ggml_time_us() - t_start_us;
-//     }
-
-//     int n_past = 0;
-
-//     int64_t t_sample_us = 0;
-//     int64_t t_predict_us = 0;
-
-//     std::vector<float> logits;
-
-//     // tokenize the prompt
-//     std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
-//     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
-//     printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-//     printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
-//     for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
-//         printf("%d ", embd_inp[i]);
-//     }
-//     printf("\n\n");
-
-//     // submit the input prompt token-by-token
-//     // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
-//     std::vector<gpt_vocab::id> embd;
-
-//     // determine the required inference memory per token:
-//     size_t mem_per_token = 0;
-//     gpt2_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-//     for (int i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
-//         // predict
-//         if (embd.size() > 0) {
-//             const int64_t t_start_us = ggml_time_us();
-
-//             if (!gpt2_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-//                 printf("Failed to predict\n");
-//                 return 1;
-//             }
-
-//             t_predict_us += ggml_time_us() - t_start_us;
-//         }
-
-//         n_past += embd.size();
-//         embd.clear();
-
-//         if (i >= embd_inp.size()) {
-//             // sample next token
-//             const int top_k = params.top_k;
-//             const float top_p = params.top_p;
-//             const float temp = params.temp;
-
-//             const int n_vocab = model.hparams.n_vocab;
-
-//             gpt_vocab::id id = 0;
-
-//             {
-//                 const int64_t t_start_sample_us = ggml_time_us();
-
-//                 id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
-//                 t_sample_us += ggml_time_us() - t_start_sample_us;
-//             }
-
-//             // add it to the context
-//             embd.push_back(id);
-//         } else {
-//             // if here, it means we are still processing the input prompt
-//             for (int k = i; k < embd_inp.size(); k++) {
-//                 embd.push_back(embd_inp[k]);
-//                 if (embd.size() >= params.n_batch) {
-//                     break;
-//                 }
-//             }
-//             i += embd.size() - 1;
-//         }
-
-//         // display text
-//         for (auto id : embd) {
-//             printf("%s", vocab.id_to_token[id].c_str());
-//         }
-//         fflush(stdout);
-
-//         // end of text token
-//         if (embd.back() == 50256) {
-//             break;
-//         }
-//     }
-
-//     // report timing
-//     {
-//         const int64_t t_main_end_us = ggml_time_us();
-
-//         printf("\n\n");
-//         printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-//         printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-//         printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-//         printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-//         printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
-//     }
-
-//     ggml_free(model.ctx);
-
-//     return 0;
-// }
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index 484ebf5ab..f02d9cfcd 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -151,6 +151,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
         struct ggml_init_params params = {
             .mem_size = ctx_size,
             .mem_buffer = NULL,
+            .no_alloc = false,
         };

         model.ctx = ggml_init(params);
@@ -263,10 +264,12 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
            break;
        }

-       int32_t nelements = 1;
-       int32_t ne[2] = { 1, 1 };
+       int64_t nelements = 1;
+       int64_t ne[2] = { 1, 1 };
        for (int i = 0; i < n_dims; ++i) {
-           fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+           int32_t ne_cur;
+           fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
+           ne[i] = ne_cur;
            nelements *= ne[i];
        }
@@ -305,7 +308,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g

        if (0) {
            static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-           printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+           printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
        }

        size_t bpe = 0;
@@ -329,7 +332,7 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
        }

        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
+
        //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
        total_size += ggml_nbytes(tensor);
        if (++n_tensors % 8 == 0) {
@@ -398,6 +401,7 @@ bool gptj_eval(
     struct ggml_init_params params = {
         .mem_size = buf_size,
         .mem_buffer = buf,
+        .no_alloc = false,
     };

     struct ggml_context * ctx0 = ggml_init(params);
@@ -428,14 +432,17 @@ bool gptj_eval(

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur);
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur);
-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur);
+            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);

             // store key and value to memory
-            if (N >= 1) {
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur));
+
                 struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(model.memory_v),
+                        (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));

                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
@@ -444,21 +451,15 @@ bool gptj_eval(
             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_cpy(ctx0,
-                                Qcur,
-                                ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                            n_past, n_rot, 0),
+                        Qcur,
                         0, 2, 1, 3);

             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             struct ggml_tensor * K =
                 ggml_permute(ctx0,
-                        ggml_rope(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                                n_embd/n_head, n_head, n_past + N),
-                            n_past, n_rot, 1),
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);

             // K * Q
@@ -478,17 +479,15 @@ bool gptj_eval(
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);

             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                    ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3),
-                    ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, model.memory_v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(model.memory_v),
+                        n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(model.memory_v)*n_embd);

             // KQV = transpose(V) * KQ_soft_max
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -587,4 +586,4 @@ bool gptj_eval(
     ggml_free(ctx0);

     return true;
-}
+}
\ No newline at end of file
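Note on the packed clblast_info value: the one non-obvious convention in this patch is the integer that koboldcpp.py now passes to expose.cpp (100 + platform*10 + device, where the leading hundred marks that CLBlast was configured at all). The sketch below is illustrative only and is not part of the diff; the helper names are hypothetical.

# Illustrative sketch, not part of the patch; helper names are hypothetical.
# The patch itself just builds and parses one integer in koboldcpp.py / expose.cpp.

def encode_clblast_info(platform: int, device: int) -> int:
    # koboldcpp.py side: +100 marks CLBlast as configured, then one digit each
    # for the platform ID and the device ID (single digits, as argparse enforces).
    return 100 + platform * 10 + device

def decode_clblast_info(value: int):
    # expose.cpp side: any non-zero value means "configured"; only the last two
    # digits are kept, then split into platform and device.
    configured = 1 if value > 0 else 0
    value %= 100
    return configured, value // 10, value % 10

# Example: --useclblast 1 0 encodes to 110 and decodes back to (1, 1, 0).
assert decode_clblast_info(encode_clblast_info(1, 0)) == (1, 1, 0)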