speculative : threading options (#4959)

* speculative: expose draft threading

* fix usage format

* accept -td and -tbd args

* speculative: revert default behavior when -td is unspecified

* fix trailing whitespace
This commit is contained in:
stduhpf 2024-01-16 12:04:32 +01:00 committed by GitHub
parent 3e5ca7931c
commit e0324285a5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 28 additions and 0 deletions

View file

@ -65,6 +65,10 @@ int main(int argc, char ** argv) {
// load the draft model
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
if (params.n_threads_draft > 0) {
params.n_threads = params.n_threads_draft;
}
params.n_threads_batch = params.n_threads_batch_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
{