fixed inpL shape and type

ningshanwutuobang 2023-06-05 21:39:35 +08:00
parent 50ce29667f
commit 5673a8de37

llama.cpp (101 changed lines)
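Note on the change: the quantization of the float input is dropped, and the evaluation graph now receives inpL as a plain F32 matrix of shape [n_embd, N] copied directly from the caller's buffer (see the second hunk below). A minimal sketch of that construction, with make_inpL as a hypothetical wrapper around the two added lines; n_embd and N stand in for model.hparams.n_embd and the token count:

#include <string.h>
#include "ggml.h"

// Sketch only: builds the eval input the way the new code path in the second
// hunk does, i.e. one row of n_embd floats per token, kept in F32 instead of
// being quantized by the removed quantize_float_tensor helper.
static struct ggml_tensor * make_inpL(struct ggml_context * ctx0,
                                      const float * input, int n_embd, int N) {
    struct ggml_tensor * inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
    memcpy(inpL->data, input, (size_t) N * n_embd * ggml_element_size(inpL));
    return inpL;
}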

@@ -2234,95 +2234,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
ggml_tensor *quantize_float_tensor(ggml_context *ctx0, ggml_tensor* tensor,
llama_ftype ftype, int nthread) {
ggml_type quantized_type;
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0:
quantized_type = GGML_TYPE_Q4_0;
break;
case LLAMA_FTYPE_MOSTLY_Q4_1:
quantized_type = GGML_TYPE_Q4_1;
break;
case LLAMA_FTYPE_MOSTLY_Q5_0:
quantized_type = GGML_TYPE_Q5_0;
break;
case LLAMA_FTYPE_MOSTLY_Q5_1:
quantized_type = GGML_TYPE_Q5_1;
break;
case LLAMA_FTYPE_MOSTLY_Q8_0:
quantized_type = GGML_TYPE_Q8_0;
break;
default:
throw format("invalid output file type %d\n", ftype);
};
void *new_data;
size_t new_size;
llama_buffer work;
float *f32_data;
size_t nelements = tensor->ne[0] * tensor->ne[1];
llama_buffer f32_conv_buf;
f32_data = (float *)tensor->data;
work.resize(nelements * 4);
new_data = work.addr;
std::vector<int64_t> hist_cur(1 << 4, 0);
std::vector<std::thread> workers;
std::mutex mutex;
enum ggml_type new_type = quantized_type;
int chunk_size = 32 * 512;
const int nchunk = (nelements + chunk_size - 1) / chunk_size;
const int nthread_use =
nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
if (nthread_use < 2) {
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements,
hist_cur.data());
} else {
size_t counter = 0;
new_size = 0;
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data,
new_data, nelements, chunk_size]() {
std::vector<int64_t> local_hist;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
size_t first = counter;
counter += chunk_size;
if (first >= nelements) {
if (!local_hist.empty()) {
for (int j = 0; j < int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
new_size += local_size;
}
break;
}
lock.unlock();
size_t last = std::min(nelements, first + chunk_size);
if (local_hist.empty()) {
local_hist.resize(hist_cur.size(), 0);
}
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first,
last - first, local_hist.data());
}
};
if ((int)workers.size() < nthread_use - 1) {
workers.resize(nthread_use - 1);
}
for (int it = 0; it < nthread_use - 1; ++it) {
workers[it] = std::thread(compute);
}
compute();
for (int it = 0; it < nthread_use - 1; ++it) {
workers[it].join();
}
}
ggml_tensor *ret =
ggml_new_tensor_2d(ctx0, new_type, tensor->ne[0], tensor->ne[1]);
memcpy(ret->data, new_data, new_size);
return ret;
}
//
// interface implementation
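For reference, the helper removed above parallelizes quantization by handing fixed-size chunks to worker threads through a mutex-protected counter; each thread keeps a local histogram and byte count and folds them into the shared totals before it exits, and the calling thread joins in as one more worker. A self-contained sketch of that pattern under assumed names, with quantize_chunk standing in for ggml_quantize_chunk:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

// Stand-in for ggml_quantize_chunk: pretend each element quantizes to half a byte.
static size_t quantize_chunk(const float * /*src*/, size_t n, int64_t * /*hist*/) {
    return n / 2;
}

static size_t quantize_parallel(const float * data, size_t nelements, int nthread,
                                std::vector<int64_t> & hist) {
    const size_t chunk_size = 32 * 512;
    std::mutex mutex;
    size_t counter  = 0;  // next element index to hand out
    size_t new_size = 0;  // total quantized size, updated under the mutex

    auto compute = [&]() {
        std::vector<int64_t> local_hist(hist.size(), 0);
        size_t local_size = 0;
        while (true) {
            size_t first;
            {
                std::lock_guard<std::mutex> lock(mutex);
                first = counter;
                counter += chunk_size;
                if (first >= nelements) {
                    // fold per-thread results into the shared totals, then stop
                    for (size_t j = 0; j < hist.size(); ++j) hist[j] += local_hist[j];
                    new_size += local_size;
                    return;
                }
            }
            const size_t last = std::min(nelements, first + chunk_size);
            local_size += quantize_chunk(data + first, last - first, local_hist.data());
        }
    };

    std::vector<std::thread> workers;
    for (int i = 0; i < nthread - 1; ++i) workers.emplace_back(compute);
    compute();  // the calling thread works too, as in the removed helper
    for (std::thread & w : workers) w.join();
    return new_size;
}

In the removed code the single-threaded path bypasses all of this and calls ggml_quantize_chunk once over the whole tensor.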
@@ -3053,15 +2964,9 @@ int llama_eval_float(
struct ggml_context * ctx0 = ggml_init(params);
- struct ggml_tensor *input_f =
- ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, N * model.hparams.n_embd);
- memcpy(input_f->data, input,
- N * model.hparams.n_embd * ggml_element_size(input_f));
- struct ggml_tensor *inpL =
- quantize_float_tensor(ctx0, input_f, model.hparams.ftype, n_threads);
- ;
+ struct ggml_tensor *inpL =
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, model.hparams.n_embd, N);
+ memcpy(inpL->data, input, N * model.hparams.n_embd * ggml_element_size(inpL));
if (!llama_eval_internal_tensor(*ctx, ctx0, inpL, N, n_past, n_threads, t_start_us)) {
fprintf(stderr, "%s: failed to eval\n", __func__);
return 1;