llama : don't zero-init vectors in quantize -> 5.1% faster

Cebtenzzre 2023-09-09 17:30:16 -04:00
parent a95aa21dad
commit f727ad5fc9


@@ -4639,8 +4639,14 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
 static void llama_convert_tensor_internal(
-    struct ggml_tensor * tensor, std::vector<float> & output, std::vector<std::thread> & workers,
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
     if (output.size() < nelements) {
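
The trick here: std::vector<T>::resize value-initializes any new elements, so for float or uint8_t it zeroes memory that the quantize loop is about to overwrite anyway. Wrapping the element type in a struct whose default constructor is empty makes that per-element construction a no-op the optimizer can drop, so resize only allocates. A minimal standalone sketch of the effect (everything except no_init itself is illustrative, not from the commit):

    #include <cstdio>
    #include <vector>

    template <typename T>
    struct no_init {
        T value;
        no_init() { /* intentionally empty: leaves `value` uninitialized */ }
    };

    int main() {
        // resize() on a plain vector<float> writes a zero for every new
        // element before the buffer is filled with real data.
        std::vector<float> zeroed;
        zeroed.resize(1 << 20);

        // resize() on vector<no_init<float>> runs an empty constructor per
        // element instead, so no memset-like pass over the buffer is needed.
        std::vector<no_init<float>> raw;
        raw.resize(1 << 20);

        // The payload is reached through .value; reading it before writing
        // would be undefined behavior, so the buffer must be filled first.
        raw[0].value = 1.0f;
        std::printf("%f\n", raw[0].value);
    }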
@@ -4895,9 +4901,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     int idx = 0;
 
-    std::vector<uint8_t> read_data;
-    std::vector<uint8_t> work;
-    std::vector<float> f32_conv_buf;
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
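
Skipping zero-initialization is safe for these three buffers because each is fully overwritten before it is read on every iteration of the quantize loop. A hedged sketch of that usage pattern (load_tensor_bytes is a hypothetical helper, not code from the commit):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    template <typename T>
    struct no_init {
        T value;
        no_init() { /* do nothing */ }
    };

    // Resize the reusable buffer to the tensor's byte size, then overwrite
    // it entirely before any element is read.
    static void load_tensor_bytes(std::vector<no_init<uint8_t>> & read_data,
                                  const void * src, size_t nbytes) {
        if (read_data.size() < nbytes) {
            read_data.resize(nbytes);  // grows without zeroing, unlike vector<uint8_t>
        }
        std::memcpy(read_data.data(), src, nbytes);  // buffer fully written here
    }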