diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 6d4746ff4..9cfde8308 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -6,49 +6,11 @@ #include #include -// a function that can be called for every computed node during graph evaluation -// the user can choose to whether to observe the data of the node depending on the tensor parameters -static bool observe_compute(struct ggml_tensor * t, bool ask, void * user_data) { - GGML_UNUSED(user_data); - - // the scheduler is asking us if we want to observe this node - if (ask) { - // check if name contains soft_max (customize to your needs) - return strstr(t->name, "soft_max") != 0; - } - - // print the node info - printf("%s: t->name = %32s, t->op = %12s, [%5d, %5d, %5d, %5d]\n", - __func__, t->name, ggml_op_name(t->op), (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); - - // this will copy the data to host memory (if needed) - static std::vector t_data; - - const bool is_host = ggml_backend_buffer_is_host(t->buffer); - - if (!is_host) { - t_data.resize(ggml_nelements(t)); - ggml_backend_tensor_get(t, t_data.data(), 0, ggml_nbytes(t)); - } - - const float * data = is_host ? (const float *) t->data : t_data.data(); - - // print first row - for (int i = 0; i < t->ne[0]; i++) { - printf("%8.4f ", data[i]); - } - printf("\n"); - - return true; -} - int main(int argc, char ** argv) { gpt_params params; - bool observe = false; - if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [PROMPT] [OBSERV]\n" , argv[0]); + printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]); return 1 ; } @@ -60,10 +22,6 @@ int main(int argc, char ** argv) { params.prompt = argv[2]; } - if (argc >= 4) { - observe = !!atoi(argv[3]); - } - if (params.prompt.empty()) { params.prompt = "Hello my name is"; } @@ -79,7 +37,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); - model_params.n_gpu_layers = 99; // offload all layers to the GPU + // model_params.n_gpu_layers = 99; // offload all layers to the GPU llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -97,9 +55,6 @@ int main(int argc, char ** argv) { ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; - ctx_params.cb_eval = observe ? observe_compute : NULL; - ctx_params.cb_eval_user_data = NULL; - llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { diff --git a/ggml-backend.c b/ggml-backend.c index 970495a4c..8dfbb2af2 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1384,10 +1384,6 @@ static void sched_reset(ggml_backend_sched_t sched) { memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); - // TODO: should we clear the callbacks? - //sched->callback_eval = NULL; - //sched->callback_eval_user_data = NULL; - sched->is_reset = true; }