diff --git a/Makefile b/Makefile
index 5e9a8d226..1a8574d08 100644
--- a/Makefile
+++ b/Makefile
@@ -12,6 +12,7 @@ endif
 
 ifndef ARCH_LINUX
 ARCH_LINUX := $(shell grep "Arch Linux" /etc/os-release 2>/dev/null)
+ARCH_LIKE := $(shell grep "ID_LIKE=arch" /etc/os-release 2>/dev/null)
 endif
 
 CCV := $(shell $(CC) --version | head -n 1)
@@ -52,10 +53,15 @@ CXXFLAGS += -pthread -s -Wno-multichar
 ifeq ($(UNAME_S),Linux)
 	CFLAGS   += -pthread
 	CXXFLAGS += -pthread
-ifdef ARCH_LINUX
-	LDFLAGS += -lcblas
-endif
+	ifdef ARCH_LINUX
+		LDFLAGS += -lcblas
+	else
+		ifdef ARCH_LIKE
+			LDFLAGS += -lcblas
+		endif
+	endif
 endif
+
 ifeq ($(UNAME_S),Darwin)
 	CFLAGS   += -pthread
 	CXXFLAGS += -pthread
@@ -117,7 +123,7 @@ ifdef LLAMA_OPENBLAS
 endif
 ifdef LLAMA_CLBLAST
 	CFLAGS += -DGGML_USE_CLBLAST -DGGML_USE_OPENBLAS
-	LDFLAGS += -lclblast -lOpenCL
+	LDFLAGS += -lclblast -lOpenCL -lopenblas
 endif
 ifdef LLAMA_GPROF
 	CFLAGS   += -pg
@@ -202,6 +208,9 @@ ggml_v1.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
 ggml_v1_noavx2.o: otherarch/ggml_v1.c otherarch/ggml_v1.h
 	$(CC) $(CFLAGS) $(BONUSCFLAGS1) -c $< -o $@
 
+ggml_rwkv.o: otherarch/ggml_rwkv.c otherarch/ggml_rwkv.h
+	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
+
 llama.o: llama.cpp llama.h llama_util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
@@ -226,19 +235,19 @@ main: examples/main/main.cpp ggml.o llama.o common.o
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-koboldcpp.dll: ggml.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
+koboldcpp.dll: ggml.o ggml_rwkv.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(CXX) $(CXXFLAGS) $^ -shared -o $@ $(LDFLAGS)
 
-koboldcpp_openblas.dll: ggml_openblas.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
+koboldcpp_openblas.dll: ggml_openblas.o ggml_rwkv.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_BUILD)
 
-koboldcpp_noavx2.dll: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
+koboldcpp_noavx2.dll: ggml_noavx2.o ggml_rwkv.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(NOAVX2_BUILD)
 
-koboldcpp_openblas_noavx2.dll: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
+koboldcpp_openblas_noavx2.dll: ggml_openblas_noavx2.o ggml_rwkv.o ggml_v1_noavx2.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(OPENBLAS_NOAVX2_BUILD)
 
-koboldcpp_clblast.dll: ggml_clblast.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
+koboldcpp_clblast.dll: ggml_clblast.o ggml_rwkv.o ggml_v1.o expose.o common.o llama_adapter.o gpttype_adapter.o
 	$(CLBLAST_BUILD)
 
 quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
@@ -247,10 +256,10 @@ quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
 quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-quantize_gptj: ggml.o llama.o otherarch/gptj_quantize.cpp
+quantize_gptj: ggml.o llama.o otherarch/tools/gptj_quantize.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-quantize_gpt2: ggml.o llama.o otherarch/gpt2_quantize.cpp
+quantize_gpt2: ggml.o llama.o otherarch/tools/gpt2_quantize.cpp
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
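Note on the Makefile change: the new ARCH_LIKE probe widens the existing Arch Linux check to derivative distros (anything declaring ID_LIKE=arch in /etc/os-release, e.g. Manjaro), so they get the same -lcblas link flag. A minimal C++ sketch of what the two $(shell grep ...) probes test is below; needs_cblas_link() is a hypothetical helper for illustration, not part of this patch.

    // Sketch of the detection performed by the ARCH_LINUX/ARCH_LIKE probes.
    // needs_cblas_link() is hypothetical, for illustration only.
    #include <fstream>
    #include <string>

    static bool needs_cblas_link()
    {
        std::ifstream osrelease("/etc/os-release");
        std::string line;
        while (std::getline(osrelease, line))
        {
            // ARCH_LINUX: the release file names "Arch Linux" outright.
            // ARCH_LIKE:  a derivative declares ID_LIKE=arch instead.
            if (line.find("Arch Linux") != std::string::npos ||
                line.find("ID_LIKE=arch") != std::string::npos)
            {
                return true;
            }
        }
        return false;
    }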
printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); - } - if (lr == ModelLoadResult::RETRY_LOAD) - { - file_format = FileFormat::GPTJ_3; - printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format); - lr = gpttype_load_model(inputs, file_format); - } + if(file_format==FileFormat::GPTJ_1) + { + //if we tried 1 first, then try 3 and lastly 2 + //otherwise if we tried 3 first, then try 2 + file_format = FileFormat::GPTJ_3; + printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format); + lr = gpttype_load_model(inputs, file_format); + } + + //lastly try format 2 + if (lr == ModelLoadResult::RETRY_LOAD) + { + file_format = FileFormat::GPTJ_2; + printf("\n---\nRetrying as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format); + lr = gpttype_load_model(inputs, file_format); + } + } if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD) { @@ -92,6 +99,19 @@ extern "C" return true; } } + else if(file_format==FileFormat::RWKV_1) + { + printf("\n---\nIdentified as RWKV model: (ver %d)\nAttempting to Load...\n---\n", file_format); + ModelLoadResult lr = gpttype_load_model(inputs, file_format); + if (lr == ModelLoadResult::FAIL || lr == ModelLoadResult::RETRY_LOAD) + { + return false; + } + else + { + return true; + } + } else { printf("\n---\nIdentified as LLAMA model: (ver %d)\nAttempting to Load...\n---\n", file_format); @@ -102,7 +122,7 @@ extern "C" generation_outputs generate(const generation_inputs inputs, generation_outputs &output) { if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 - || file_format==FileFormat::GPT2_1 || file_format==FileFormat::GPT2_2 ) + || file_format==FileFormat::GPT2_1 || file_format==FileFormat::GPT2_2 || file_format==FileFormat::RWKV_1) { return gpttype_generate(inputs, output); } diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index dc4b37265..9ce3cf37f 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -17,6 +17,7 @@ #include "otherarch/gptj_v2.cpp" #include "otherarch/gpt2_v1.cpp" #include "otherarch/gpt2_v2.cpp" +#include "otherarch/rwkv.cpp" //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt) static FileFormat file_format = FileFormat::BADFORMAT; @@ -25,6 +26,7 @@ static gptj_model_v1 model_v1; static gptj_model model_v2; static gpt2_v1_model model_gpt2_v1; static gpt2_model model_gpt2_v2; +static rwkv_context * rwkv_context_v1; static gpt_params params; static int n_past = 0; static int n_threads = 4; @@ -59,7 +61,45 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in params.n_ctx = inputs.max_context_length; model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx; - if (file_format == FileFormat::GPT2_1) + if (file_format == FileFormat::RWKV_1) + { + rwkv_context_v1 = rwkv_init_from_file(modelname.c_str(), n_threads); + + //setup buffers for rwkv state + auto padding = 512u; + auto statebufsiz = rwkv_get_state_buffer_element_count(rwkv_context_v1) * sizeof(float) + padding; + auto logitbufsiz = rwkv_get_logits_buffer_element_count(rwkv_context_v1) * sizeof(float) + padding; + + printf("\nRWKV Init: State Buffer:%u, Logit Buffer:%u\n", statebufsiz, logitbufsiz); + rwkv_context_v1->state_out = (float *)malloc(statebufsiz); + rwkv_context_v1->logits_out = (float *)malloc(logitbufsiz); + 
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index dc4b37265..9ce3cf37f 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -17,6 +17,7 @@
 #include "otherarch/gptj_v2.cpp"
 #include "otherarch/gpt2_v1.cpp"
 #include "otherarch/gpt2_v2.cpp"
+#include "otherarch/rwkv.cpp"
 
 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 static FileFormat file_format = FileFormat::BADFORMAT;
@@ -25,6 +26,7 @@ static gptj_model_v1 model_v1;
 static gptj_model model_v2;
 static gpt2_v1_model model_gpt2_v1;
 static gpt2_model model_gpt2_v2;
+static rwkv_context * rwkv_context_v1;
 static gpt_params params;
 static int n_past = 0;
 static int n_threads = 4;
@@ -59,7 +61,45 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     params.n_ctx = inputs.max_context_length;
     model_v1.hparams.n_ctx = model_v2.hparams.n_ctx = model_gpt2_v1.hparams.n_ctx = model_gpt2_v2.hparams.n_ctx = params.n_ctx;
 
-    if (file_format == FileFormat::GPT2_1)
+    if (file_format == FileFormat::RWKV_1)
+    {
+        rwkv_context_v1 = rwkv_init_from_file(modelname.c_str(), n_threads);
+        if (rwkv_context_v1 == NULL)
+        {
+            return ModelLoadResult::FAIL;
+        }
+
+        //setup buffers for rwkv state
+        auto padding = 512u;
+        auto statebufsiz = rwkv_get_state_buffer_element_count(rwkv_context_v1) * sizeof(float) + padding;
+        auto logitbufsiz = rwkv_get_logits_buffer_element_count(rwkv_context_v1) * sizeof(float) + padding;
+
+        printf("\nRWKV Init: State Buffer:%zu, Logit Buffer:%zu\n", statebufsiz, logitbufsiz);
+        rwkv_context_v1->state_out = (float *)malloc(statebufsiz);
+        rwkv_context_v1->logits_out = (float *)malloc(logitbufsiz);
+        rwkv_context_v1->state_in = nullptr;
+        n_batch = 1;
+
+        //stub vocab of 20 single-character tokens, pending a real RWKV tokenizer
+        std::string word;
+        for (int i = 0; i < 20; i++) {
+            word = ('a'+i);
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+        }
+
+        int vocabsiz = vocab.token_to_id.size();
+        bool testeval = rwkv_eval(rwkv_context_v1, 0, rwkv_context_v1->state_in, rwkv_context_v1->state_out, rwkv_context_v1->logits_out);
+        if (!testeval)
+        {
+            printf("\nError: RWKV Init Eval Failed!\n");
+        }
+        logits.resize(vocabsiz);
+        memcpy(logits.data(), rwkv_context_v1->logits_out, sizeof(float)*vocabsiz);
+
+        return ModelLoadResult::SUCCESS;
+    }
+    else if (file_format == FileFormat::GPT2_1)
     {
         ModelLoadResult res = legacy_gpt2_model_load(params.model, model_gpt2_v1, vocab, file_format);
         if(res==ModelLoadResult::FAIL)
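Note on the load path: RWKV is a recurrent architecture, so the context lives in the state buffers rather than in a re-evaluated token window, which is also why ContextFastForward is skipped for it in the remaining gpttype_adapter.cpp hunks below. A sketch of how such a stateful eval loop chains state between tokens follows; it assumes the rwkv.cpp API used above, feed_tokens() is hypothetical, and the state_in/state_out hand-off is an assumption of the sketch rather than something shown in this excerpt.

    // Sketch of a stateful RWKV eval loop, one token per call.
    // Assumes the rwkv.cpp API used above; feed_tokens() is hypothetical,
    // and chaining state_in = state_out is this sketch's assumption.
    #include <cstdio>
    #include <vector>

    static void feed_tokens(rwkv_context * ctx, const std::vector<int> & tokens)
    {
        ctx->state_in = nullptr; // nullptr asks rwkv_eval to start from a blank state
        for (int tok : tokens)
        {
            if (!rwkv_eval(ctx, tok, ctx->state_in, ctx->state_out, ctx->logits_out))
            {
                printf("\nError: RWKV Eval Failed!\n");
                return;
            }
            ctx->state_in = ctx->state_out; // next call resumes from this state
        }
    }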
@@ -209,7 +249,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
     n_past = 0;
 
-    ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
+    if(file_format!=FileFormat::RWKV_1)
+    {
+        ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, useSmartContext);
+    }
 
     //if using BLAS and prompt is big enough, switch to single thread and use a huge batch
     bool approved_format = (file_format!=FileFormat::GPT2_1 && file_format!=FileFormat::GPTJ_1 && file_format!=FileFormat::GPTJ_2);
@@ -228,6 +271,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     current_context_tokens.resize(n_past);
 
     int remaining_tokens = params.n_predict;
+    int stopper_unused_tokens = 0;
     int input_consumed = 0;
     std::mt19937 rng(params.seed);
     std::string concat_output = "";
@@ -254,12 +298,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = model_gpt2_v2.hparams.n_vocab;
     }
+    else if(file_format == FileFormat::RWKV_1)
+    {
+        n_vocab = vocab.id_to_token.size(); //handled separately
+    }
     else
     {
         printf("Bad format!");
     }
 
     printf("\n");
+
     while (remaining_tokens > 0)
     {
         gpt_vocab::id id = 0;
@@ -278,9 +327,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         }
 
         bool evalres = false;
-
-        //print_tok_vec(logits);
-        if(file_format==FileFormat::GPT2_1)
+
+        if(file_format==FileFormat::RWKV_1)
+        {
+            evalres = rwkv_eval(rwkv_context_v1, embd[0], rwkv_context_v1->state_in, rwkv_context_v1->state_out, rwkv_context_v1->logits_out);
+        }
+        else if(file_format==FileFormat::GPT2_1)
         {
             evalres = legacy_gpt2_eval(model_gpt2_v1, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
         }
@@ -326,14 +378,14 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         }
 
         {
-            // set the logit of the eos token (2) to zero to avoid sampling it
-            logits[50256] = (logits[50256]<0?logits[50256]:0);
-
+            // set the logit of the eos token (50256) to zero to avoid sampling it
+            if(logits.size()>50256)
+            {
+                logits[50256] = (logits[50256]<0?logits[50256]:0);
+            }
             //gpt2 uses negative logits, so we can't zero it
             id = gptj_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);
-
-
             last_n_tokens.erase(last_n_tokens.begin());
             last_n_tokens.push_back(id);
             current_context_tokens.push_back(id);
@@ -352,6 +404,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
         {
             if (concat_output.find(matched) != std::string::npos)
             {
+                stopper_unused_tokens = remaining_tokens;
                 remaining_tokens = 0;
                 printf("\n(Stop sequence triggered: <%s>)",matched.c_str());
                 break;
@@ -378,7 +431,8 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     }
     time2 = timer_check();
     float pt1 = (time1*1000.0/(embd_inp_size==0?1:embd_inp_size));
-    float pt2 = (time2*1000.0/(params.n_predict==0?1:params.n_predict));
+    int realnpredict = params.n_predict-stopper_unused_tokens;
+    float pt2 = (time2*1000.0/(realnpredict==0?1:realnpredict));
     printf("\nTime Taken - Processing:%.1fs (%.0fms/T), Generation:%.1fs (%.0fms/T), Total:%.1fs", time1, pt1, time2, pt2, (time1 + time2));
     fflush(stdout);
     output.status = 1;
diff --git a/klite.embd b/klite.embd
index 31ecc71b4..5d71a229a 100644
--- a/klite.embd
+++ b/klite.embd
@@ -1,6 +1,6 @@
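Note on the timing fix in gpttype_generate above: per-token generation time is now computed against tokens actually produced, since a triggered stop sequence leaves part of the n_predict budget unused. A small self-contained example of the corrected arithmetic; ms_per_token() is a hypothetical helper mirroring the realnpredict calculation.

    // The generation ms/T now divides by tokens actually emitted.
    // ms_per_token() is hypothetical, mirroring realnpredict above.
    #include <cstdio>

    static float ms_per_token(float seconds, int n_predict, int unused_tokens)
    {
        int realnpredict = n_predict - unused_tokens; // budget minus tokens the stopper skipped
        return seconds * 1000.0f / (realnpredict == 0 ? 1 : realnpredict);
    }

    int main()
    {
        // e.g. 8.0s of generation, budget of 80 tokens, stop sequence fired with 30 left:
        printf("%.0f ms/T\n", ms_per_token(8.0f, 80, 30)); // prints 160, not 100
        return 0;
    }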