From 3ed3e7b7e2b98cbc867bf42a4599ecf11a03422a Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Wed, 14 Jun 2023 20:03:14 +0800 Subject: [PATCH] reverted sequence mode for rwkv due to multiple issues with speed loss with bigger quantized models --- ggml.h | 2 +- gpttype_adapter.cpp | 2 +- koboldcpp.py | 2 +- otherarch/rwkv_v3.cpp | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml.h b/ggml.h index a20d2cc30..f2a91761b 100644 --- a/ggml.h +++ b/ggml.h @@ -194,7 +194,7 @@ #define GGML_QNT_VERSION_FACTOR 1000 // do not change this #define GGML_MAX_DIMS 4 -#define GGML_MAX_NODES 16384 +#define GGML_MAX_NODES 4096 #define GGML_MAX_PARAMS 256 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_OPT 4 diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp index 8307e7ec5..775f29bb6 100644 --- a/gpttype_adapter.cpp +++ b/gpttype_adapter.cpp @@ -479,7 +479,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in } else { - n_batch = 8; //use sequence mode to speedup + n_batch = 1; //do not use sequence mode to speedup until it is fixed //setup buffers for rwkv state auto padding = 512u; diff --git a/koboldcpp.py b/koboldcpp.py index ca576da39..affd4b8df 100644 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -224,7 +224,7 @@ maxctx = 2048 maxlen = 256 modelbusy = False defaultport = 5001 -KcppVersion = "1.30.2" +KcppVersion = "1.30.3" class ServerRequestHandler(http.server.SimpleHTTPRequestHandler): sys_version = "" diff --git a/otherarch/rwkv_v3.cpp b/otherarch/rwkv_v3.cpp index 4aa9237d6..0396f9934 100644 --- a/otherarch/rwkv_v3.cpp +++ b/otherarch/rwkv_v3.cpp @@ -484,8 +484,8 @@ struct rwkv_ggml_context { return; } - const size_t memory_required_overhead = size_t(256) * 1024 * 1024; - const size_t memory_required_overhead_sc = size_t(128) * 1024 * 1024; + const size_t memory_required_overhead = size_t(128) * 1024 * 1024; + const size_t memory_required_overhead_sc = size_t(64) * 1024 * 1024; ctx = ggml_init({ size.objects_count * GGML_OBJECT_SIZE + size.objects_size + memory_required_overhead, NULL, false});