Reverted sequence mode for RWKV because it caused multiple speed regressions with larger quantized models
This commit is contained in:
parent
f83b66606b
commit
3ed3e7b7e2
4 changed files with 5 additions and 5 deletions
2
ggml.h
2
ggml.h
|
@@ -194,7 +194,7 @@
|
|||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_NODES 16384
|
||||
#define GGML_MAX_NODES 4096
|
||||
#define GGML_MAX_PARAMS 256
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_OPT 4
|
||||
|
|
|
@@ -479,7 +479,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
}
|
||||
else
|
||||
{
|
||||
n_batch = 8; //use sequence mode to speedup
|
||||
n_batch = 1; //do not use sequence mode to speedup until it is fixed
|
||||
|
||||
//setup buffers for rwkv state
|
||||
auto padding = 512u;
|
||||
|
|
|
@@ -224,7 +224,7 @@ maxctx = 2048
|
|||
maxlen = 256
|
||||
modelbusy = False
|
||||
defaultport = 5001
|
||||
KcppVersion = "1.30.2"
|
||||
KcppVersion = "1.30.3"
|
||||
|
||||
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||
sys_version = ""
|
||||
|
|
|
@@ -484,8 +484,8 @@ struct rwkv_ggml_context {
|
|||
return;
|
||||
}
|
||||
|
||||
const size_t memory_required_overhead = size_t(256) * 1024 * 1024;
|
||||
const size_t memory_required_overhead_sc = size_t(128) * 1024 * 1024;
|
||||
const size_t memory_required_overhead = size_t(128) * 1024 * 1024;
|
||||
const size_t memory_required_overhead_sc = size_t(64) * 1024 * 1024;
|
||||
|
||||
ctx = ggml_init({ size.objects_count * GGML_OBJECT_SIZE + size.objects_size + memory_required_overhead, NULL, false});
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue