ggml : mul_mat_id use the same tensor for all the experts (#6387)

* ggml : update mul_mat_id to use the same tensor for all the experts

* update cuda

* minor

* update metal

* update test-backend-ops

* fix cuda

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* update convert.py

* update convert-hf-to-gguf.py

* update convert.py for mixtral hf models

* Update convert-hf-to-gguf.py

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* cuda : support non-pow-2 number of experts

* allow quantize to work for split and merged experts models in the same way

* cleanup + disable mmap automatically with split tensors models

* update imatrix

* test-backend-ops : test qwen argsort

* update grok model loading

* llama : add merged experts tensors to the grok tensor map

* minor

* gguf : bump version

* fix quantizing of merged experts

* convert-hf-to-gguf.py : update grok (untested)

* make linter happy

* cuda/argsort : use shared memory instead of pool memory

* convert : fix grok tensor names

* metal : add support for non-pow-2 argsort

* llama : more loader cleanup, better error checking

* cuda : fix warning

* llama : still use mmap for loading old models, but copy the data to a host buffer

* add review note

* llama : remove ffn tensor counting + add sanity check

ggml-ci

* convert : fix handling of n_experts == None

ggml-ci

* imatrix : fix ncall counters

* llama : produce error if imatrix size does not match

* quantize : terminate on errors + trace logs

ggml-ci

* metal : pad shared memory to 16 bytes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
slaren 2024-04-03 15:07:05 +02:00 committed by GitHub
parent 52604860f9
commit 08a0c02060
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 756 additions and 888 deletions

View file

@ -828,6 +828,15 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)
def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
def load() -> Tensor:
tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
s = lazy_tensors[0].shape.copy()
s.insert(0, len(lazy_tensors))
return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))
# Functionality that simulates `torch.load` but where individual tensors are
# only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of time of writing:
@ -1246,6 +1255,22 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
tmp = model
# merge experts into one tensor
if params.n_experts and params.n_experts > 0:
for i_l in range(params.n_layer):
for w in range(1, 4):
experts = []
for e in range(params.n_experts):
if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
else:
raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)
# HF models permut or pack some of the tensors, so we need to undo that
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model: