diff --git a/llama.cpp b/llama.cpp
index 47b4c8dd7..fff90a143 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1194,27 +1194,14 @@ static bool llama_model_load(
     }
 }
 
-// evaluate the transformer
-//
-//   - lctx:      llama context
-//   - tokens:    new batch of tokens to process
-//   - n_past:    the context size so far
-//   - n_threads: number of threads to use
-//
-static bool llama_eval_internal(
-        llama_context & lctx,
-    const llama_token * tokens,
+static bool llama_eval_internal_tensor(
+        llama_context& lctx,
+        ggml_context* ctx0,
+        ggml_tensor* inpL,
            const int   n_tokens,
            const int   n_past,
-           const int   n_threads) {
-
-    // enforce that the first token is BOS
-    if (n_past == 0 && tokens[0] != llama_token_bos()) {
-        fprintf(stderr, "%s: first token must be BOS\n", __func__);
-        return false;
-    }
-
-    const int64_t t_start_us = ggml_time_us();
+           const int   n_threads,
+        const int64_t t_start_us) {
 
     const int N = n_tokens;
 
@@ -1223,8 +1210,6 @@ static bool llama_eval_internal(
 
     const auto & kv_self = model.kv_self;
 
-    LLAMA_ASSERT(!!kv_self.ctx);
-
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;
     const int n_ctx   = hparams.n_ctx;
@@ -1233,26 +1218,14 @@ static bool llama_eval_internal(
     const int n_rot   = hparams.n_embd/hparams.n_head;
 
     auto & mem_per_token = lctx.mem_per_token;
-    auto & buf_compute   = lctx.buf_compute;
 
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ buf_compute.size,
-        /*.mem_buffer =*/ buf_compute.addr,
-        /*.no_alloc   =*/ false,
-    };
-
-    struct ggml_context * ctx0 = ggml_init(params);
+    LLAMA_ASSERT(!!kv_self.ctx);
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     ggml_cgraph gf = {};
     gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
 
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    ggml_set_name(embd, "embd");
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
-
-    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
 
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * inpSA = inpL;
@@ -1494,6 +1467,52 @@ static bool llama_eval_internal(
     return true;
 }
 
+
+// evaluate the transformer
+//
+//   - lctx:      llama context
+//   - tokens:    new batch of tokens to process
+//   - n_past:    the context size so far
+//   - n_threads: number of threads to use
+//
+static bool llama_eval_internal(
+        llama_context & lctx,
+    const llama_token * tokens,
+            const int   n_tokens,
+            const int   n_past,
+            const int   n_threads) {
+
+    // enforce that the first token is BOS
+    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+        return false;
+    }
+
+    const auto & model = lctx.model;
+
+    const int64_t t_start_us = ggml_time_us();
+
+    const int N = n_tokens;
+
+    auto & buf_compute = lctx.buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.addr,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+
+    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    ggml_set_name(embd, "embd");
+    memcpy(embd->data, tokens, N*ggml_element_size(embd));
+
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
+    return llama_eval_internal_tensor(lctx, ctx0, inpL, N, n_past, n_threads, t_start_us);
+}
+
 //
 // tokenizer
 //
@@ -2214,6 +2233,97 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
+
+ggml_tensor *quantize_float_tensor(ggml_context *ctx0, ggml_tensor* tensor,
+                                   llama_ftype ftype, int nthread) {
+
+    ggml_type quantized_type;
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0:
+            quantized_type = GGML_TYPE_Q4_0;
+            break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1:
+            quantized_type = GGML_TYPE_Q4_1;
+            break;
+        case LLAMA_FTYPE_MOSTLY_Q5_0:
+            quantized_type = GGML_TYPE_Q5_0;
+            break;
+        case LLAMA_FTYPE_MOSTLY_Q5_1:
+            quantized_type = GGML_TYPE_Q5_1;
+            break;
+        case LLAMA_FTYPE_MOSTLY_Q8_0:
+            quantized_type = GGML_TYPE_Q8_0;
+            break;
+        default:
+            throw format("invalid output file type %d\n", ftype);
+    };
+    void *new_data;
+    size_t new_size;
+    llama_buffer work;
+    float *f32_data;
+    size_t nelements = tensor->ne[0] * tensor->ne[1];
+    llama_buffer f32_conv_buf;
+    f32_data = (float *)tensor->data;
+    work.resize(nelements * 4);
+    new_data = work.addr;
+    std::vector<int64_t> hist_cur(1 << 4, 0);
+    std::vector<std::thread> workers;
+    std::mutex mutex;
+    enum ggml_type new_type = quantized_type;
+
+    int chunk_size = 32 * 512;
+    const int nchunk = (nelements + chunk_size - 1) / chunk_size;
+    const int nthread_use =
+        nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
+    if (nthread_use < 2) {
+        new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements,
+                                       hist_cur.data());
+    } else {
+        size_t counter = 0;
+        new_size = 0;
+        auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data,
+                        new_data, nelements, chunk_size]() {
+            std::vector<int64_t> local_hist;
+            size_t local_size = 0;
+            while (true) {
+                std::unique_lock<std::mutex> lock(mutex);
+                size_t first = counter;
+                counter += chunk_size;
+                if (first >= nelements) {
+                    if (!local_hist.empty()) {
+                        for (int j = 0; j < int(local_hist.size()); ++j) {
+                            hist_cur[j] += local_hist[j];
+                        }
+                        new_size += local_size;
+                    }
+                    break;
+                }
+                lock.unlock();
+                size_t last = std::min(nelements, first + chunk_size);
+                if (local_hist.empty()) {
+                    local_hist.resize(hist_cur.size(), 0);
+                }
+                local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first,
+                                                  last - first, local_hist.data());
+            }
+        };
+        if ((int)workers.size() < nthread_use - 1) {
+            workers.resize(nthread_use - 1);
+        }
+        for (int it = 0; it < nthread_use - 1; ++it) {
+            workers[it] = std::thread(compute);
+        }
+        compute();
+        for (int it = 0; it < nthread_use - 1; ++it) {
+            workers[it].join();
+        }
+    }
+    ggml_tensor *ret =
+        ggml_new_tensor_2d(ctx0, new_type, tensor->ne[0], tensor->ne[1]);
+    memcpy(ret->data, new_data, new_size);
+    return ret;
+}
+
 //
 // interface implementation
 //
@@ -2921,6 +3031,51 @@ int llama_eval(
     return 0;
 }
 
+int llama_eval_float(
+        struct llama_context * ctx,
+                 const float * input,
+                         int   n_tokens,
+                         int   n_past,
+                         int   n_threads) {
+    const auto & model = ctx->model;
+
+    const int64_t t_start_us = ggml_time_us();
+
+    const int N = n_tokens;
+
+    auto & buf_compute = ctx->buf_compute;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_compute.size,
+        /*.mem_buffer =*/ buf_compute.addr,
+        /*.no_alloc   =*/ false,
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+
+
+    struct ggml_tensor *input_f =
+        ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N * model.hparams.n_embd);
+    memcpy(input_f->data, input,
+           N * model.hparams.n_embd * ggml_element_size(input_f));
+    struct ggml_tensor *inpL =
+        quantize_float_tensor(ctx0, input_f, model.hparams.ftype, n_threads);
+
+    if (!llama_eval_internal_tensor(*ctx, ctx0, inpL, N, n_past, n_threads, t_start_us)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+
+    // get a more accurate load time, upon first eval
+    // TODO: fix this
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
+
+    return 0;
+}
+
 int llama_tokenize(
         struct llama_context * ctx,
                   const char * text,
diff --git a/llama.h b/llama.h
index c6b0a2889..3b984845c 100644
--- a/llama.h
+++ b/llama.h
@@ -173,6 +173,13 @@ extern "C" {
                              int   n_past,
                              int   n_threads);
 
+    LLAMA_API int llama_eval_float(
+            struct llama_context * ctx,
+                     const float * embds,
+                             int   n_tokens,
+                             int   n_past,
+                             int   n_threads);
+
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
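
For reference (not part of the patch): a minimal, hypothetical caller of the new llama_eval_float entry point, showing the intended call sequence against the public API as it exists in this revision of llama.h. The model path and the zero-filled buffer are placeholders; the sketch assumes the embeddings are passed as n_tokens consecutive rows of n_embd floats, which matches the N * n_embd values llama_eval_float copies into its F32 input tensor.

// Hypothetical usage sketch for llama_eval_float (not part of the patch).
// The model path is a placeholder and the embedding values are dummy zeros;
// a real caller would fill `embds` with n_tokens rows of n_embd floats.
#include "llama.h"

#include <cstdio>
#include <vector>

int main() {
    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_file("models/7B/ggml-model-q4_0.bin", cparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    const int n_embd    = llama_n_embd(ctx);
    const int n_tokens  = 4;  // number of embedding vectors in this batch
    const int n_past    = 0;  // empty KV cache
    const int n_threads = 4;

    std::vector<float> embds((size_t) n_tokens * n_embd, 0.0f);

    if (llama_eval_float(ctx, embds.data(), n_tokens, n_past, n_threads) != 0) {
        fprintf(stderr, "llama_eval_float failed\n");
        llama_free(ctx);
        return 1;
    }

    // logits for the last position are read the same way as after llama_eval()
    const float * logits = llama_get_logits(ctx);
    printf("first logit: %f\n", logits[0]);

    llama_free(ctx);
    return 0;
}

Note that this path skips the BOS requirement: the check now lives in llama_eval_internal, and llama_eval_float goes straight to llama_eval_internal_tensor, so a batch with n_past == 0 and no BOS token is accepted here.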