Minor fixes

Faisal Zaghloul 2024-07-31 12:42:30 -04:00 committed by fmz
parent 130adf8415
commit a0aae528bb
5 changed files with 18 additions and 6 deletions

View file

@@ -50,6 +50,6 @@ else()
     endif()
     add_subdirectory(save-load-state)
     add_subdirectory(simple)
-    #add_subdirectory(speculative)
+    add_subdirectory(speculative)
     add_subdirectory(tokenize)
 endif()
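Note: the speculative example had been commented out of the build, presumably while the threading parameters were being reworked; with the cpuparams migration in the next file it compiles again, so this hunk re-enables it.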

View file

@@ -73,10 +73,11 @@ int main(int argc, char ** argv) {
     // load the draft model
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
-    if (params.n_threads_draft > 0) {
-        params.n_threads = params.n_threads_draft;
+    if (params.draft_cpuparams.n_threads > 0) {
+        params.cpuparams.n_threads = params.draft_cpuparams.n_threads;
     }
-    params.n_threads_batch = params.n_threads_batch_draft;
+
+    params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
     llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
     model_dft = llama_init_dft.model;
     ctx_dft   = llama_init_dft.context;
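Note: this hunk moves the draft-model thread overrides from the old flat fields (n_threads_draft, n_threads_batch_draft) onto the nested per-role cpu_params structs introduced by this PR. A minimal sketch of the shape these field accesses imply; the actual struct in common lives elsewhere and carries more members, so the layout here is an assumption:

    // Hypothetical sketch, not the real common.h definition.
    struct cpu_params {
        int n_threads = -1; // -1 = autodetect
        // the real struct presumably also carries affinity/priority/poll settings
    };

    struct gpt_params_sketch {
        cpu_params cpuparams;             // target model, generation
        cpu_params cpuparams_batch;       // target model, batch/prompt processing
        cpu_params draft_cpuparams;       // draft model, generation
        cpu_params draft_cpuparams_batch; // draft model, batch/prompt processing
    };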

View file

@@ -18737,7 +18737,7 @@ static bool __thread_affinity(const bool * mask) {
 
     for (uint32_t i = 0; i < GGML_MAX_N_THREADS; i++) {
         if (mask[i]) {
-            printf("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
+            GGML_PRINT_DEBUG("Thread %lx: adding %d to cpuset\n", pthread_self(), i);
             CPU_SET(i, &cpuset);
         }
     }
@@ -19130,7 +19130,7 @@ static bool ggml_graph_compute_check_for_work(struct ggml_compute_state * state)
     if (threadpool->poll) {
         while (!threadpool->new_work && !threadpool->stop && !threadpool->pause) {
             // No new work. Yield and keep polling.
-            //__cpu_relax();
+            __cpu_relax();
         }
     } else {
         ggml_mutex_lock_shared(&threadpool->mutex);
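Note: the first hunk downgrades a per-thread affinity printf to GGML_PRINT_DEBUG so release builds stay quiet; the second re-enables __cpu_relax() inside the polling loop so spinning workers issue a spin-wait hint instead of hammering the flags. A hedged sketch of what such a primitive commonly expands to; ggml's actual per-platform definition may differ:

    // Assumed portable spin-wait hint, in the spirit of ggml's __cpu_relax().
    #if defined(__x86_64__) || defined(__i386__)
    #include <immintrin.h>
    static inline void cpu_relax(void) { _mm_pause(); }               // x86 PAUSE
    #elif defined(__aarch64__)
    static inline void cpu_relax(void) { __asm__ volatile("yield"); } // ARM YIELD
    #else
    static inline void cpu_relax(void) {}                             // no-op fallback
    #endif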

View file

@@ -439,6 +439,8 @@ extern "C" {
     LLAMA_API void llama_detach_batch_threadpool(struct llama_context * ctx);
     LLAMA_API void llama_detach_threadpools(struct llama_context * ctx);
 
+    // Pauses all attached threadpools
+    LLAMA_API void llama_pause_threadpools(struct llama_context * ctx);
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
 
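Note: llama_pause_threadpools() complements the attach/detach calls declared just above it. A hedged usage sketch, assuming the attach flow from the rest of this PR; ctx stands for an existing llama_context with pools attached:

    // After a generation finishes and before the context goes idle:
    llama_pause_threadpools(ctx); // park polling workers so they stop spinning
    // ... idle period, e.g. waiting for the next request ...
    // the pools are presumably resumed when new work is scheduled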

View file

@@ -17526,6 +17526,15 @@ void llama_detach_threadpools(struct llama_context * ctx) {
     llama_detach_batch_threadpool(ctx);
 }
 
+void llama_pause_threadpools(struct llama_context * ctx) {
+    if (ctx->threadpool) {
+        ggml_pause_threadpool(ctx->threadpool);
+    }
+    if (ctx->threadpool_batch) {
+        ggml_pause_threadpool(ctx->threadpool_batch);
+    }
+}
+
 void llama_backend_free(void) {
     ggml_quantize_free();
 }
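Note: both branches of the new function are guarded, so the call is a no-op when no pool of that kind is attached; this mirrors llama_detach_threadpools() above, which likewise handles the generation and batch pools independently.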