From 5673a8de37b453ce1646a12bf3f0442956b68c02 Mon Sep 17 00:00:00 2001
From: ningshanwutuobang
Date: Mon, 5 Jun 2023 21:39:35 +0800
Subject: [PATCH] fixed inpL shape and type

---
 llama.cpp | 101 ++----------------------------------------------------
 1 file changed, 3 insertions(+), 98 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index fff90a143..c76b19812 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2234,95 +2234,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
-ggml_tensor *quantize_float_tensor(ggml_context *ctx0, ggml_tensor* tensor,
-                                   llama_ftype ftype, int nthread) {
-
-    ggml_type quantized_type;
-    switch (ftype) {
-    case LLAMA_FTYPE_MOSTLY_Q4_0:
-        quantized_type = GGML_TYPE_Q4_0;
-        break;
-    case LLAMA_FTYPE_MOSTLY_Q4_1:
-        quantized_type = GGML_TYPE_Q4_1;
-        break;
-    case LLAMA_FTYPE_MOSTLY_Q5_0:
-        quantized_type = GGML_TYPE_Q5_0;
-        break;
-    case LLAMA_FTYPE_MOSTLY_Q5_1:
-        quantized_type = GGML_TYPE_Q5_1;
-        break;
-    case LLAMA_FTYPE_MOSTLY_Q8_0:
-        quantized_type = GGML_TYPE_Q8_0;
-        break;
-    default:
-        throw format("invalid output file type %d\n", ftype);
-    };
-    void *new_data;
-    size_t new_size;
-    llama_buffer work;
-    float *f32_data;
-    size_t nelements = tensor->ne[0] * tensor->ne[1];
-    llama_buffer f32_conv_buf;
-    f32_data = (float *)tensor->data;
-    work.resize(nelements * 4);
-    new_data = work.addr;
-    std::vector<int64_t> hist_cur(1 << 4, 0);
-    std::vector<std::thread> workers;
-    std::mutex mutex;
-    enum ggml_type new_type = quantized_type;
-
-    int chunk_size = 32 * 512;
-    const int nchunk = (nelements + chunk_size - 1) / chunk_size;
-    const int nthread_use =
-        nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
-    if (nthread_use < 2) {
-        new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements,
-                                       hist_cur.data());
-    } else {
-        size_t counter = 0;
-        new_size = 0;
-        auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data,
-                        new_data, nelements, chunk_size]() {
-            std::vector<int64_t> local_hist;
-            size_t local_size = 0;
-            while (true) {
-                std::unique_lock<std::mutex> lock(mutex);
-                size_t first = counter;
-                counter += chunk_size;
-                if (first >= nelements) {
-                    if (!local_hist.empty()) {
-                        for (int j = 0; j < int(local_hist.size()); ++j) {
-                            hist_cur[j] += local_hist[j];
-                        }
-                        new_size += local_size;
-                    }
-                    break;
-                }
-                lock.unlock();
-                size_t last = std::min(nelements, first + chunk_size);
-                if (local_hist.empty()) {
-                    local_hist.resize(hist_cur.size(), 0);
-                }
-                local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first,
-                                                  last - first, local_hist.data());
-            }
-        };
-        if ((int)workers.size() < nthread_use - 1) {
-            workers.resize(nthread_use - 1);
-        }
-        for (int it = 0; it < nthread_use - 1; ++it) {
-            workers[it] = std::thread(compute);
-        }
-        compute();
-        for (int it = 0; it < nthread_use - 1; ++it) {
-            workers[it].join();
-        }
-    }
-    ggml_tensor *ret =
-        ggml_new_tensor_2d(ctx0, new_type, tensor->ne[0], tensor->ne[1]);
-    memcpy(ret->data, new_data, new_size);
-    return ret;
-}
 
 //
 // interface implementation
 //
@@ -3053,15 +2964,9 @@ int llama_eval_float(
 
     struct ggml_context * ctx0 = ggml_init(params);
 
-
-    struct ggml_tensor *input_f =
-        ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N * model.hparams.n_embd);
-    memcpy(input_f->data, input,
-           N * model.hparams.n_embd * ggml_element_size(input_f));
-    struct ggml_tensor *inpL =
-        quantize_float_tensor(ctx0, input_f, model.hparams.ftype, n_threads);
-
-    ;
+    struct ggml_tensor *inpL =
+        ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, model.hparams.n_embd, N);
+    memcpy(inpL->data, input, N * model.hparams.n_embd * ggml_element_size(inpL));
     if (!llama_eval_internal_tensor(*ctx, ctx0, inpL, N, n_past, n_threads, t_start_us)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;