ggml : mul_mat_id use the same tensor for all the experts (#6387)
* ggml : update mul_mat_id to use the same tensor for all the experts
* update cuda
* minor
* update metal
* update test-backend-ops
* fix cuda
* Update ggml-metal.m
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* update convert.py
* update convert-hf-to-gguf.py
* update convert.py for mixtral hf models
* Update convert-hf-to-gguf.py
  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* cuda : support non-pow-2 number of experts
* allow quantize to work for split and merged experts models in the same way
* cleanup + disable mmap automatically with split tensors models
* update imatrix
* test-backend-ops : test qwen argsort
* update grok model loading
* llama : add merged experts tensors to the grok tensor map
* minor
* gguf : bump version
* fix quantizing of merged experts
* convert-hf-to-gguf.py : update grok (untested)
* make linter happy
* cuda/argsort : use shared memory instead of pool memory
* convert : fix grok tensor names
* metal : add support for non-pow-2 argsort
* llama : more loader cleanup, better error checking
* cuda : fix warning
* llama : still use mmap for loading old models, but copy the data to a host buffer
* add review note
* llama : remove ffn tensor counting + add sanity check
  ggml-ci
* convert : fix handling of n_experts == None
  ggml-ci
* imatrix : fix ncall counters
* llama : produce error if imatrix size does not match
* quantize : terminate on errors + trace logs
  ggml-ci
* metal : pad shared memory to 16 bytes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
parent 52604860f9
commit 08a0c02060
15 changed files with 756 additions and 888 deletions
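The central change is that the per-expert FFN weight matrices are no longer separate 2D tensors selected by index; they are merged into a single 3D tensor whose third dimension is the expert index, and `ggml_mul_mat_id` now works on that merged tensor. Below is a minimal, self-contained sketch of the layout idea only, using plain buffers with made-up sizes, not the ggml API:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Illustrative sizes (hypothetical, not from any real model).
constexpr int n_embd = 4;   // input features
constexpr int n_ff   = 3;   // output features per expert
constexpr int n_as   = 8;   // number of experts

int main() {
    // Merged layout: all expert matrices stored back-to-back in one buffer,
    // conceptually a 3D tensor with the expert index as the outermost dimension.
    std::vector<float> merged(n_embd * n_ff * n_as);
    for (size_t i = 0; i < merged.size(); ++i) merged[i] = 0.01f * i;

    const int expert_id = 5;  // the expert picked by the router for this row
    // Expert slice: instead of a separate tensor per expert, take an offset.
    const float * w = merged.data() + (size_t)expert_id * n_embd * n_ff;

    // y = W x for the selected expert
    std::vector<float> x(n_embd, 1.0f), y(n_ff, 0.0f);
    for (int r = 0; r < n_ff; ++r) {
        for (int c = 0; c < n_embd; ++c) {
            y[r] += w[r * n_embd + c] * x[c];
        }
    }

    printf("expert %d: y[0] = %f\n", expert_id, y[0]);
    return 0;
}
```

Keeping all experts in one tensor lets quantization, imatrix collection and the backends treat the expert dimension like any other dimension, instead of looping over per-expert tensors.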
@@ -98,35 +98,38 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
 
+    // this has been adapted to the new format of storing merged experts in a single 3d tensor
+    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
         const int idx = ((int32_t *) t->op_params)[0];
-        const int n_as = ((int32_t *) t->op_params)[1];
+        const ggml_tensor * ids = t->src[2];
+        const int n_as = src0->ne[2];
 
-        // the top-k selected expert ids are stored in the src0 tensor
-        // for simplicity, always copy src0 to host, because it is small
-        // take into account that src0 is not contiguous!
-        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
-        GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
-        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
-        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
+        // the top-k selected expert ids are stored in the ids tensor
+        // for simplicity, always copy ids to host, because it is small
+        // take into account that ids is not contiguous!
+        GGML_ASSERT(ids->ne[1] == src1->ne[1]);
+        GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
+        m_ids.resize(ggml_nbytes(ids)/sizeof(int));
+        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
 
+        auto & e = m_stats[wname];
+
+        ++e.ncall;
+        // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
+        // using the following line, we can correct for that if needed by replacing the line above with:
+        //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
 
         // loop over all possible experts, regardless if they are used or not in the batch
         // this is necessary to guarantee equal number of "ncall" for each tensor
         for (int ex = 0; ex < n_as; ++ex) {
-            src0 = t->src[2 + ex];
-            wname = filter_tensor_name(src0->name);
-            auto& e = m_stats[wname];
+            size_t e_start = ex*src1->ne[0];
             if (e.values.empty()) {
-                e.values.resize(src1->ne[0], 0);
+                e.values.resize(src1->ne[0]*n_as, 0);
             }
-            else if (e.values.size() != (size_t)src1->ne[0]) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
+            else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
                 exit(1); //GGML_ASSERT(false);
             }
-            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
-            // using the following line, we can correct for that if needed
-            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
-            ++e.ncall;
            if (m_params.verbosity > 1) {
                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
            }
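For reference, after this change `GGML_OP_MUL_MAT_ID` carries the merged expert weights in `src0`, the activations in `src1` and the selected expert ids in `src[2]`, which is exactly what the collector reads above. A small sketch of the shape relationships, with invented Mixtral-like dimensions and an assumed top-k of 2 (the `ne[]` values here are illustrative and not taken from a real graph; in ggml, `ne[0]` is the innermost dimension):

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    // src0: merged expert weights, roughly {n_embd, n_ff, n_as, 1}
    int64_t src0_ne[4] = {4096, 14336, 8, 1};
    // src1: activations, roughly {n_embd, n_tokens, 1, 1}
    int64_t src1_ne[4] = {4096, 32, 1, 1};
    // ids: selected expert ids, one row per activation row (top-k entries per row)
    int64_t ids_ne[4]  = {2, 32, 1, 1};

    const int n_as = (int)src0_ne[2]; // expert count now comes from src0's third dimension
    assert(ids_ne[1] == src1_ne[1]);  // one row of ids per activation row, as the collector asserts

    printf("n_as = %d, n_embd = %d, n_tokens = %d\n",
           n_as, (int)src0_ne[0], (int)src1_ne[1]);
    return 0;
}
```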
@@ -136,7 +139,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
                 if (excur != ex) continue;
                 const float * x = data + row * src1->ne[0];
                 for (int j = 0; j < (int)src1->ne[0]; ++j) {
-                    e.values[j] += x[j]*x[j];
+                    e.values[e_start + j] += x[j]*x[j];
                 }
             }
             if (e.ncall > m_last_call) {
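With the merged format the imatrix collector keeps a single statistics vector per expert weight tensor: `e.values` is resized to `src1->ne[0]*n_as`, expert `ex` owns the slice starting at `e_start = ex*src1->ne[0]`, and the squared activations of the rows routed to that expert are accumulated into that slice (the two hunks above). A standalone sketch of that bookkeeping, with invented sizes and routing; the real code takes all of these from the ggml tensors:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const int n_embd   = 4;  // src1->ne[0]: activation width (illustrative)
    const int n_as     = 3;  // number of experts
    const int n_tokens = 5;  // src1->ne[1]: rows in the batch

    // One flat stats vector for the whole merged tensor:
    // expert ex owns the slice [ex*n_embd, (ex+1)*n_embd).
    std::vector<float> values(n_embd * n_as, 0.0f);

    // Pretend routing: row i was assigned to expert routed[i].
    const int routed[n_tokens] = {0, 2, 1, 2, 0};

    // Pretend activations: x[row][j] for row `row`, feature `j`.
    std::vector<float> x(n_tokens * n_embd, 0.5f);

    for (int ex = 0; ex < n_as; ++ex) {
        const size_t e_start = (size_t)ex * n_embd;   // slice offset for this expert
        for (int row = 0; row < n_tokens; ++row) {
            if (routed[row] != ex) continue;          // only rows routed to this expert
            const float * xi = x.data() + (size_t)row * n_embd;
            for (int j = 0; j < n_embd; ++j) {
                values[e_start + j] += xi[j] * xi[j]; // accumulate squared activations
            }
        }
    }

    printf("expert 2, feature 0: %f\n", values[2 * n_embd + 0]);
    return 0;
}
```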
@@ -116,13 +116,13 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
     std::ifstream in(imatrix_file.c_str(), std::ios::binary);
     if (!in) {
         printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
-        return;
+        exit(1);
     }
     int n_entries;
     in.read((char *)&n_entries, sizeof(n_entries));
     if (in.fail() || n_entries < 1) {
         printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
-        return;
+        exit(1);
     }
     for (int i = 0; i < n_entries; ++i) {
         int len; in.read((char *)&len, sizeof(len));
@@ -130,11 +130,11 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
         in.read((char *)name_as_vec.data(), len);
         if (in.fail()) {
             printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
-            return;
+            exit(1);
         }
         name_as_vec[len] = 0;
         std::string name{name_as_vec.data()};
-        auto & e = imatrix_data[std::move(name)];
+        auto & e = imatrix_data[name];
         int ncall;
         in.read((char *)&ncall, sizeof(ncall));
         int nval;
@@ -142,18 +142,22 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
         if (in.fail() || nval < 1) {
             printf("%s: failed reading number of values for entry %d\n", __func__, i);
             imatrix_data = {};
-            return;
+            exit(1);
         }
         e.resize(nval);
         in.read((char *)e.data(), nval*sizeof(float));
         if (in.fail()) {
             printf("%s: failed reading data for entry %d\n", __func__, i);
             imatrix_data = {};
-            return;
+            exit(1);
         }
         if (ncall > 0) {
             for (auto& v : e) v /= ncall;
         }
+
+        if (getenv("LLAMA_TRACE")) {
+            printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
+        }
     }
     printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
 }
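On the quantize side, `load_imatrix` now terminates with `exit(1)` instead of returning silently when the file is truncated or malformed, keeps `name` intact (the `std::move` is gone, presumably because the name is reused by the new `LLAMA_TRACE` log line), and can trace every entry when the `LLAMA_TRACE` environment variable is set. The binary layout the loop reads is implied by the reads: an entry count, then for each entry a length-prefixed name, a call count, a value count and that many floats. Below is a hedged sketch of a writer for that same layout, inferred from this reader rather than taken from the actual save code:

```cpp
#include <fstream>
#include <string>
#include <utility>
#include <vector>

// Write entries in the layout the loader above reads back:
//   int32 n_entries
//   per entry: int32 len, len bytes of name, int32 ncall, int32 nval, nval floats
static void write_imatrix(const std::string & path,
                          const std::vector<std::pair<std::string, std::vector<float>>> & entries,
                          int ncall) {
    std::ofstream out(path.c_str(), std::ios::binary);
    int n_entries = (int)entries.size();
    out.write((const char *)&n_entries, sizeof(n_entries));
    for (const auto & kv : entries) {
        int len = (int)kv.first.size();
        out.write((const char *)&len, sizeof(len));
        out.write(kv.first.data(), len);
        out.write((const char *)&ncall, sizeof(ncall));
        int nval = (int)kv.second.size();
        out.write((const char *)&nval, sizeof(nval));
        out.write((const char *)kv.second.data(), nval*sizeof(float));
    }
}

int main() {
    // Hypothetical tensor name and values, for illustration only.
    write_imatrix("test.imatrix", {{"blk.0.ffn_down.weight", std::vector<float>(16, 1.0f)}}, 1);
    return 0;
}
```

Running the sketch produces a file that the loader shown above would accept; in practice imatrix files are written by the imatrix example, not by this sketch.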