diff --git a/common/common.h b/common/common.h index 875e012a2..9659aa045 100644 --- a/common/common.h +++ b/common/common.h @@ -51,7 +51,7 @@ struct gpt_params { int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_draft = 16; // number of tokens to draft during speculative decoding + int32_t n_draft = 8; // number of tokens to draft during speculative decoding int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_parallel = 1; // number of parallel sequences to decode int32_t n_sequences = 1; // number of sequences to decode diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index 6b4eb957a..ab1be0a32 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -17,7 +17,7 @@ int main(int argc, char ** argv){ const int max_ngram_size = 3; // length of the candidate / draft sequence, if match is found - const int n_draft = 10; + const int n_draft = params.n_draft; const bool dump_kv_cache = params.dump_kv_cache;