llama : don't zero-init vectors in quantize -> 5.1% faster

parent a95aa21dad
commit f727ad5fc9

1 changed file with 10 additions and 4 deletions:

llama.cpp | 14 ++++++++++----
@@ -4639,8 +4639,14 @@ void llama_beam_search(llama_context * ctx,
 // quantization
 //
 
+template <typename T>
+struct no_init {
+    T value;
+    no_init() { /* do nothing */ }
+};
+
 static void llama_convert_tensor_internal(
-    struct ggml_tensor * tensor, std::vector<float> & output, std::vector<std::thread> & workers,
+    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
 ) {
     if (output.size() < nelements) {
@@ -4895,9 +4901,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     int idx = 0;
 
-    std::vector<uint8_t> read_data;
-    std::vector<uint8_t> work;
-    std::vector<float> f32_conv_buf;
+    std::vector<no_init<uint8_t>> read_data;
+    std::vector<no_init<uint8_t>> work;
+    std::vector<no_init<float>> f32_conv_buf;
 
     // populate the original tensors so we get an initial meta data
     for (int i = 0; i < ml->n_tensors; ++i) {
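
For context: a minimal standalone sketch (not part of the commit) of why the no_init wrapper avoids the zeroing. std::vector<T>::resize() value-initializes new elements, which for float and uint8_t means zero fills; wrapping the element type in a struct whose user-provided default constructor does nothing makes that initialization a no-op, so resize() only allocates. The main() driver and toy sizes below are illustrative assumptions, not code from llama.cpp.

    #include <cstdio>
    #include <vector>

    // Same wrapper as in the diff: because the default constructor is
    // user-provided and empty, value-initialization no longer zeroes `value`.
    template <typename T>
    struct no_init {
        T value;
        no_init() { /* do nothing */ }
    };

    int main() {
        std::vector<float> zeroed;
        zeroed.resize(1024);        // writes 1024 zeros

        std::vector<no_init<float>> raw;
        raw.resize(1024);           // allocates only; contents are indeterminate

        raw[0].value = 1.0f;        // write before read: reading an
                                    // uninitialized value would be UB
        printf("zeroed[0] = %g, raw[0] = %g\n", zeroed[0], raw[0].value);
        return 0;
    }

The wrapper has the same size and alignment as T, so in practice the buffer's data() pointer can still be handed to code expecting raw T storage. The 5.1% figure in the title is the speedup the commit reports for quantization, where these buffers are large and overwritten before being read anyway.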