merge PowerInfer impl from the internal codebase

Holden 2023-12-12 11:05:32 +08:00
parent 6bb4908a17
commit a3c295a2ae
17 changed files with 4515 additions and 169 deletions

.gitignore (vendored): 2 changes

@ -98,3 +98,5 @@ tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe
build-info.h


@ -471,6 +471,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
break;
}
params.lora_base = argv[i];
} else if (arg == "--mlp-adapter") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mlp_adapter = argv[i];
} else if (arg == "--mmproj") {
if (++i >= argc) {
invalid_param = true;
@ -950,8 +956,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
return std::make_tuple(nullptr, nullptr);
}
if (llama_use_sparse_inference(model)) {
fprintf(stderr, "%s: postprocessing PowerInfer model '%s'\n", __func__, params.model.c_str());
if (!params.mlp_adapter.empty()) {
fprintf(stderr, "%s: warning: --mlp-adapter is deprecated and has no effect\n", __func__);
int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
}
if (llama_model_apply_augmentation(model) != 0) {
fprintf(stderr, "%s: error: failed to apply augmentation\n", __func__);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
}
auto cparams = llama_context_params_from_gpt_params(params);
llama_context * lctx = llama_new_context_with_model(model, cparams);
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
@ -981,6 +1005,8 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
}
{
LOG("warming up the model with an empty run\n");
@ -1320,6 +1346,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la)); fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
} }
fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
fprintf(stream, "mlp_adapter: %s\n", params.mlp_adapter.c_str());
fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");
fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);


@ -90,6 +90,8 @@ struct gpt_params {
std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
std::string lora_base = ""; // base model path for the lora adapter
std::string mlp_adapter = ""; // sparse activation mlp adapter path
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)


@ -0,0 +1,601 @@
#!/usr/bin/env python3
from __future__ import annotations
from abc import ABC, abstractmethod
import argparse
import contextlib
import json
import os
import re
import struct
import sys
from enum import IntEnum
from pathlib import Path
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Optional, cast
import numpy as np
import torch
import torch.nn as tnn
if TYPE_CHECKING:
from torch import Tensor
if "NO_LOCAL_GGUF" not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / "gguf-py"))
import gguf
###### MODEL DEFINITIONS ######
class SentencePieceTokenTypes(IntEnum):
NORMAL = 1
UNKNOWN = 2
CONTROL = 3
USER_DEFINED = 4
UNUSED = 5
BYTE = 6
class ReluMLP(tnn.Module):
def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
super(ReluMLP, self).__init__()
self.fc1 = tnn.Linear(input_dim, hidden_dim, bias=False)
self.relu = tnn.ReLU()
self.fc2 = tnn.Linear(hidden_dim, output_dim, bias=False)
def forward(self, x):
x = self.fc1(x)
x = self.relu(x)
x = self.fc2(x)
return x
@staticmethod
def from_file(model_file: Path):
model = torch.load(model_file, map_location="cpu")
hidden_size, input_size = model.get("fc1.weight").shape
output_size, _ = model.get("fc2.weight").shape
mlp = ReluMLP(input_size, hidden_size, output_size)
mlp.load_state_dict(model)
return mlp
class Model(ABC):
"""Base class for model conversion"""
def __init__(
self,
dir_model: Path,
dir_mlp_pred: Path,
ftype: int,
fname_out: Path,
is_big_endian: bool,
):
self.dir_model = dir_model
self.dir_mlp_pred = dir_mlp_pred
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = (
gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
)
self.is_safetensors = self._is_model_safetensors()
self.num_parts = Model.count_model_parts(
self.dir_model, ".safetensors" if self.is_safetensors else ".bin"
)
self.part_names = self._get_part_names()
self.hparams = Model.load_hparams(self.dir_model)
self.model_arch = self._get_model_architecture()
self.gguf_writer = gguf.GGUFWriter(
fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file = False
)
def set_vocab(self):
self._set_vocab_gpt2()
def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
for model_layer, part_name in self._get_mlp_part_layer_names():
print(f"gguf: loading mlp part '{part_name}'")
mlp_model = ReluMLP.from_file(self.dir_mlp_pred / part_name)
for name, data in mlp_model.state_dict().items():
yield f"blk.{model_layer}.{name}", data
for part_name in self.part_names:
print(f"gguf: loading model part '{part_name}'")
ctx: ContextManager[Any]
if self.is_safetensors:
from safetensors import safe_open
ctx = cast(
ContextManager[Any],
safe_open(self.dir_model / part_name, framework="pt", device="cpu"),
)
else:
ctx = contextlib.nullcontext(
torch.load(self.dir_model / part_name, map_location="cpu")
)
with ctx as model_part:
for name in model_part.keys():
data = (
model_part.get_tensor(name)
if self.is_safetensors
else model_part[name]
)
yield name, data
@abstractmethod
def set_gguf_parameters(self):
pass
# self.gguf_writer.add_name(self.dir_model.name)
# self.gguf_writer.add_block_count(
# self.hparams.get(
# "n_layers",
# self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
# )
# )
# if (n_ctx := self.hparams.get("max_position_embeddings")) is not None:
# self.gguf_writer.add_context_length(n_ctx)
# if (n_embd := self.hparams.get("hidden_size")) is not None:
# self.gguf_writer.add_embedding_length(n_embd)
# if (n_ff := self.hparams.get("intermediate_size")) is not None:
# self.gguf_writer.add_feed_forward_length(n_ff)
# if (n_head := self.hparams.get("num_attention_head")) is not None:
# self.gguf_writer.add_head_count(n_head)
# self.gguf_writer.add_parallel_residual(
# self.hparams.get("use_parallel_residual", True)
# )
@abstractmethod
def write_tensors(self):
pass
def write(self):
self.write_tensors()
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file()
self.gguf_writer.close()
def write_vocab(self):
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close()
@staticmethod
def count_model_parts(dir_model: Path, prefix: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.endswith(prefix):
num_parts += 1
return num_parts
@staticmethod
def load_hparams(dir_model):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
return json.load(f)
@staticmethod
def from_model_architecture(model_architecture):
if model_architecture in ("FalconForCausalLM", "RWForCausalLM"):
return FalconModel
if model_architecture == "LlamaForCausalLM":
return LlamaModel
raise NotImplementedError(f'Architecture "{model_architecture}" not supported!')
def _is_model_safetensors(self) -> bool:
return Model.count_model_parts(self.dir_model, ".safetensors") > 0
def _get_mlp_part_layer_names(self):
"""Returns a generator of (index, name) for MLP predictors of each model layer"""
n_mlp_parts = Model.count_model_parts(self.dir_mlp_pred, ".pt")
return ((n, f"model_{n}.pt") for n in range(n_mlp_parts))
def _get_part_names(self):
if self.is_safetensors:
if self.num_parts == 1: # there's only one .safetensors file
return ("model.safetensors",)
return (
f"model-{n:05}-of-{self.num_parts:05}.safetensors"
for n in range(1, self.num_parts + 1)
)
if self.num_parts == 1: # there's only one .bin file
return ("pytorch_model.bin",)
return (
f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin"
for n in range(1, self.num_parts + 1)
)
def _get_model_architecture(self) -> gguf.MODEL_ARCH:
arch = self.hparams["architectures"][0]
if arch == "FalconForCausalLM":
return gguf.MODEL_ARCH.FALCON
if arch == "RWForCausalLM" or arch == "LlamaForCausalLM":
return gguf.MODEL_ARCH.LLAMA
raise NotImplementedError(f'Architecture "{arch}" not supported!')
def _translate_tensor_key(
self, key: str, try_suffixes=(".weight", ".bias")
) -> Optional[str]:
block_count = self.hparams.get(
"n_layers",
self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")),
)
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
arch_tensor_key = tensor_map.get_name(key, try_suffixes=try_suffixes)
if arch_tensor_key is not None:
return arch_tensor_key
# check and handle ReluMLP layers
mlp_match = re.match(r"^blk\.\d+\.fc\d\.weight$", key)
if mlp_match:
return mlp_match.group(0)
return None
def _set_vocab_gpt2(self):
dir_model = self.dir_model
hparams = self.hparams
tokens: list[bytearray] = []
toktypes: list[int] = []
from transformers import AutoTokenizer # type: ignore[attr-defined]
tokenizer = AutoTokenizer.from_pretrained(dir_model)
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
assert max(tokenizer.vocab.values()) < vocab_size
reverse_vocab = {
id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()
}
added_vocab = tokenizer.get_added_vocab()
for i in range(vocab_size):
if i not in reverse_vocab:
pad_token = f"[PAD{i}]".encode("utf-8")
tokens.append(bytearray(pad_token))
toktypes.append(gguf.TokenType.USER_DEFINED)
elif reverse_vocab[i] in added_vocab:
tokens.append(reverse_vocab[i])
if tokenizer.added_tokens_decoder[i].special:
toktypes.append(gguf.TokenType.CONTROL)
else:
toktypes.append(gguf.TokenType.USER_DEFINED)
else:
tokens.append(reverse_vocab[i])
toktypes.append(gguf.TokenType.NORMAL)
self.gguf_writer.add_tokenizer_model("gpt2")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor
tokenizer_path = self.dir_model / "tokenizer.model"
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
if not tokenizer_path.is_file():
print(f"Error: Missing {tokenizer_path}", file=sys.stderr)
sys.exit(1)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
vocab_size = self.hparams.get("vocab_size", tokenizer.vocab_size())
for token_id in range(vocab_size):
piece = tokenizer.id_to_piece(token_id)
text = piece.encode("utf-8")
score = tokenizer.get_score(token_id)
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.is_unknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.is_control(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.is_unused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.is_byte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = self.dir_model / "added_tokens.json"
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
added_tokens_json = json.load(f)
for key in added_tokens_json:
tokens.append(key.encode("utf-8"))
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
class LlamaModel(Model):
def set_vocab(self):
self._set_vocab_sentencepiece()
def set_gguf_parameters(self):
self.gguf_writer.add_name("Llama")
self.gguf_writer.add_context_length(2048) # not in config.json
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(self.hparams["num_hidden_layers"])
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
self.gguf_writer.add_rope_dimension_count(
self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
)
self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])
self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self):
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith(
(
".attention.masked_bias",
".attention.bias",
".attention.rotary_emb.inv_freq",
)
):
continue
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = self._translate_tensor_key(name)
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
# We need to transpose the weight matrices for the FFN Down layers to support the
# Axpy operation in PowerInfer. So we don't need to transpose them at runtime.
if "ffn_down" in new_name:
new_name = new_name.replace("ffn_down", "ffn_down_t")
data = data.T
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if (
self.ftype == 1
and data_dtype == np.float32
and name.endswith(".weight")
and n_dims == 2
):
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
class FalconModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams.get("num_hidden_layers")
if block_count is None:
block_count = self.hparams["n_layer"] # old name
n_head = self.hparams.get("num_attention_heads")
if n_head is None:
n_head = self.hparams["n_head"] # old name
n_head_kv = self.hparams.get("num_kv_heads")
if n_head_kv is None:
n_head_kv = self.hparams.get("n_head_kv", 1) # old name
self.gguf_writer.add_name("Falcon")
self.gguf_writer.add_context_length(2048) # not in config.json
self.gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_feed_forward_length(4 * self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head_kv)
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)
def write_tensors(self):
n_head = self.hparams.get("num_attention_heads")
if n_head is None:
n_head = self.hparams["n_head"] # old name
n_head_kv = self.hparams.get("num_kv_heads")
if n_head_kv is None:
n_head_kv = self.hparams.get("n_head_kv", 1) # old name
head_dim = self.hparams["hidden_size"] // n_head
for name, data_torch in self.get_tensors():
old_dtype = data_torch.dtype
# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)
# QKV tensor transform
# The original query_key_value tensor contains n_head_kv "kv groups",
# each consisting of n_head/n_head_kv query weights followed by one key
# and one value weight (shared by all query heads in the kv group).
# This layout makes it a big pain to work with in GGML.
# So we rearrange them here, so that we have n_head query weights
# followed by n_head_kv key weights followed by n_head_kv value weights,
# in contiguous fashion.
# ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
if "query_key_value" in name:
qkv = data_torch.view(
n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head
)
q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
data_torch = torch.cat((q, k, v)).reshape_as(data_torch)
data = data_torch.squeeze().numpy()
# map tensor names
new_name = self._translate_tensor_key(name)
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()
# We need to transpose the weight matrices for the FFN Down layers to support the
# Axpy operation in PowerInfer. So we don't need to transpose them at runtime.
if "ffn_down" in new_name:
new_name = new_name.replace("ffn_down", "ffn_down_t")
data = data.T
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why can't we use these float16 as-is? There should be no reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if (
self.ftype == 1
and data_dtype == np.float32
and name.endswith(".weight")
and n_dims == 2
):
data = data.astype(np.float16)
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
self.gguf_writer.add_tensor(new_name, data)
###### CONVERSION LOGIC ######
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Convert a huggingface model to a GGML compatible file"
)
parser.add_argument(
"--vocab-only",
action="store_true",
help="extract only the vocab",
)
parser.add_argument(
"--outfile",
type=Path,
help="path to write to; default: based on input",
)
parser.add_argument(
"--outtype",
type=str,
choices=["f32", "f16"],
default="f16",
help="output format - use f32 for float32, f16 for float16",
)
parser.add_argument(
"--bigendian",
action="store_true",
help="model is executed on big endian machine",
)
parser.add_argument(
"model",
type=Path,
help="directory containing model file",
)
parser.add_argument(
"mlp_predictors",
type=Path,
help="directory containing MLP predictors for model",
)
return parser.parse_args()
args = parse_args()
dir_model = args.model
dir_mlp_pred = args.mlp_predictors
if not dir_model.is_dir():
print(f"Error: {args.model} is not a directory", file=sys.stderr)
sys.exit(1)
if not dir_mlp_pred.is_dir():
print(f"Error: {args.mlp_predictors} is not a directory", file=sys.stderr)
sys.exit(1)
ftype_map = {
"f32": gguf.GGMLQuantizationType.F32,
"f16": gguf.GGMLQuantizationType.F16,
}
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f"ggml-model-{args.outtype}.gguf"
print(f"Loading model: {dir_model.name}")
hparams = Model.load_hparams(dir_model)
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(
dir_model, dir_mlp_pred, ftype_map[args.outtype], fname_out, args.bigendian
)
print("Set model parameters")
model_instance.set_gguf_parameters()
print("Set model tokenizer")
model_instance.set_vocab()
if args.vocab_only:
print(f"Exporting model vocab to '{fname_out}'")
model_instance.write_vocab()
else:
print(f"Exporting model to '{fname_out}'")
model_instance.write()
# post-process: write another unique file header to distinguish from the original GGUF file
with open(fname_out, "r+b") as fout:
POWERINFER_MAGIC = int.from_bytes(b"PWRI", "little")
fout.write(struct.pack("<I", POWERINFER_MAGIC))
print(f"Model successfully exported to '{fname_out}'")


@ -509,6 +509,14 @@ class LazyTensor:
def load() -> Tensor:
return self.load().astype(data_type)
return LazyTensor(load, self.shape, data_type, f'convert({data_type}) {self.description}')
def transposed(self) -> LazyTensor:
def load() -> Tensor:
loaded = self.load()
assert isinstance(loaded, UnquantizedTensor), f'Cannot transpose {loaded}'
loaded.ndarray = loaded.ndarray.T
return loaded
return LazyTensor(load, self.shape[::-1], self.data_type, f'transpose {self.description}')
def validate_conversion_to(self, data_type: DataType) -> None:
if data_type != self.data_type and data_type.name not in self.data_type.valid_conversions:
@ -571,7 +579,8 @@ def merge_multifile_models(models_plus: list[ModelPlus]) -> ModelPlus:
except StopIteration:
vocab = None
if any("model.embed_tokens.weight" in mp.model for mp in models_plus) or \
any("model.layers.0.fc1.weight" in mp.model for mp in models_plus):
# Transformers models put different tensors in different files, but
# don't split individual tensors between files.
model: LazyModel = {}
@ -992,6 +1001,18 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
return out
def postprocess_transpose(model: LazyModel) -> LazyModel:
"""Transpose ffn_down matrices for Axpy ops."""
out: LazyModel = {}
for name, lazy_tensor in model.items():
if name.endswith(".ffn_down.weight"):
out[name.replace("ffn_down", "ffn_down_t")] = lazy_tensor.transposed()
else:
out[name] = lazy_tensor
return out
def nth_multifile_path(path: Path, n: int) -> Path | None:
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
the nth path in the model.
@ -1003,7 +1024,9 @@ def nth_multifile_path(path: Path, n: int) -> Path | None:
# - x-00001-of-00002.bin, x-00002-of-00002.bin, etc.
(r'-[0-9]{5}-of-(.*)$', fr'-{n:05}-of-\1'),
# x.bin, x.bin.1, etc.
(r'(\.[0-9]+)?$', r'\1' if n == 0 else fr'\1.{n}'),
# x_0.pt, x_1.pt, etc.
(r'(_[0-9]+)?\.pt$', fr'_{n}.pt'),
]
for regex, replacement in patterns:
if re.search(regex, path.name):
@ -1057,6 +1080,25 @@ def load_some_model(path: Path) -> ModelPlus:
model_plus = merge_multifile_models(models_plus)
return model_plus
def load_mlp_model(path: Path) -> ModelPlus:
'''Load MLP models for sparse attention from directory.'''
assert path.is_dir(), f"MLP model path {path} is not a directory"
first_model_path = path / "model_0.pt"
assert first_model_path.exists(), f"MLP model path {path} does not contain model_0.pt"
model_paths = find_multifile_paths(first_model_path)
models_plus: list[ModelPlus] = []
for model_path in model_paths:
# find number in model_path
model_layer = int(re.search(r'model_(\d+).pt', str(model_path)).group(1))
print(f"Loading MLP model file {model_path}")
mlp_model = lazy_load_file(model_path)
mlp_model.model = {f"model.layers.{model_layer}.{name}": tensor for name, tensor in mlp_model.model.items()}
models_plus.append(mlp_model)
return merge_multifile_models(models_plus)
def load_vocab(path: Path, vocabtype: str | None) -> Vocab:
# Be extra-friendly and accept either a file or a directory. Also, if it's
@ -1125,6 +1167,7 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin, *.safetensors)")
parser.add_argument("mlp_model", type=Path, help="MLP model for sparse attention")
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm") parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY) parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
@ -1138,6 +1181,8 @@ def main(args_in: list[str] | None = None) -> None:
if not args.vocab_only:
model_plus = load_some_model(args.model)
mlp_predictor_plus = load_mlp_model(args.mlp_model)
model_plus = merge_multifile_models([model_plus, mlp_predictor_plus])
else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)
@ -1192,6 +1237,7 @@ def main(args_in: list[str] | None = None) -> None:
model = model_plus.model
model = convert_model_names(model, params)
model = postprocess_transpose(model)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
@ -1202,6 +1248,11 @@ def main(args_in: list[str] | None = None) -> None:
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
print(f"Wrote {outfile}")
# post-process: write another unique file header to distinguish from the original GGUF file
with open(outfile, "r+b") as fout:
POWERINFER_MAGIC = int.from_bytes(b"PWRI", "little")
fout.write(struct.pack("<I", POWERINFER_MAGIC))
if __name__ == '__main__':
main()
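The postprocess_transpose step above renames ffn_down to ffn_down_t and stores the matrix transposed so that the runtime can compute the down-projection as an accumulation of scaled rows (axpy), touching only rows whose activations survived the ReLU. A small sketch of that equivalence, with made-up names and shapes, not code from this commit:

// Illustration: the down-projection y = W_down * h computed as axpy over rows of the
// transposed matrix; only rows whose activation h[j] is non-zero are ever touched.
#include <cstddef>
#include <vector>

// w_down_t mirrors the ffn_down_t layout: w_down_t[j] is row j, of length n_embd.
static std::vector<float> ffn_down_sparse(const std::vector<std::vector<float>> & w_down_t,
                                          const std::vector<float> & h) {
    const std::size_t n_ff   = w_down_t.size();
    const std::size_t n_embd = w_down_t.empty() ? 0 : w_down_t[0].size();
    std::vector<float> y(n_embd, 0.0f);
    for (std::size_t j = 0; j < n_ff; ++j) {
        const float alpha = h[j];
        if (alpha == 0.0f) {
            continue; // inactive neuron: its row is skipped entirely
        }
        for (std::size_t i = 0; i < n_embd; ++i) {
            y[i] += alpha * w_down_t[j][i]; // one axpy per active neuron
        }
    }
    return y;
}

The dense product gives the same vector; the sparse form only reads the rows selected by the activation predictor, which is where the transposed storage pays off.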


@ -11,7 +11,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL] [N_THREAD] [MLP_PATH]\n" , argv[0]);
return 1 ;
}
@ -44,6 +44,17 @@ int main(int argc, char ** argv) {
n_gpu_layers = std::atoi(argv[5]);
}
if (argc >= 7) {
params.n_threads = std::atoi(argv[6]);
}
if (argc >= 8) {
params.mlp_adapter = argv[7];
}
printf("params: model = %s, prompt = %s, n_parallel = %d, n_len = %d, n_gpu_layers = %d, n_threads = %d, mlp_adapter = %s\n",
params.model.c_str(), params.prompt.c_str(), n_parallel, n_len, n_gpu_layers, params.n_threads, params.mlp_adapter.c_str());
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
@ -65,6 +76,21 @@ int main(int argc, char ** argv) {
return 1;
}
if (!params.mlp_adapter.empty()) {
int err = llama_model_apply_mlp_from_file(model, params.mlp_adapter.c_str(), true);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply mlp adapter\n", __func__);
llama_free_model(model);
return 1;
}
}
if (llama_model_apply_augmentation(model) != 0) {
fprintf(stderr, "%s: error: failed to apply model augmentation\n", __func__);
llama_free_model(model);
return 1;
}
// tokenize the prompt
std::vector<llama_token> tokens_list;

File diff suppressed because it is too large


@ -29,7 +29,11 @@ GGML_API void ggml_cuda_host_free(void * ptr);
GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split);
GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
GGML_API void ggml_cuda_alloc_tensor(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_cpy_1d(struct ggml_tensor * dst, const struct ggml_tensor * src);
GGML_API bool debug_equal(short *a, short *b);
GGML_API void **ggml_cuda_get_data_pp(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);


@ -2423,6 +2423,78 @@ static inline __m128i get_scale_shuffle(int i) {
}
#endif
void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale) {
const int qk = QK8_0;
const int nb = n / qk;
assert(n % qk == 0);
assert(nb % 2 == 0);
const block_q4_0 * restrict x = vx;
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
__m256i alpha_v = _mm256_set1_epi16((short)alpha);
// Main loop
for (int i = 0; i < nb; ++i) {
/* Compute combined scale for the block */
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(x[i].d) * GGML_FP16_TO_FP32(scale) );
__m256i bx = bytes_from_nibbles_32(x[i].qs);
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8( 8 );
bx = _mm256_sub_epi8( bx, off );
// process 16 quantized values at a time
__m128i m_a = _mm256_extracti128_si256(bx, 0);
__m256i m_x = _mm256_cvtepi8_epi16(m_a); //16 elements
m_x = _mm256_mullo_epi16(m_x, alpha_v);
__m128i x_0 = _mm256_extracti128_si256(m_x, 0);
__m256i x0_32 = _mm256_cvtepi16_epi32(x_0);
__m256 fx0 = _mm256_cvtepi32_ps(x0_32);
fx0 = _mm256_mul_ps(fx0, d);
__m256 by = _mm256_loadu_ps((const __m256 *)(vy+i*128));
by = _mm256_add_ps(by, fx0);
_mm256_storeu_ps((__m256*)(vz + i*128), by);
//second phase
x_0 = _mm256_extracti128_si256(m_x, 1);
x0_32 = _mm256_cvtepi16_epi32(x_0);
fx0 = _mm256_cvtepi32_ps(x0_32);
fx0 = _mm256_mul_ps(fx0, d);
by = _mm256_loadu_ps((const __m256 *)(vy+i*128+32));
by = _mm256_add_ps(by, fx0);
_mm256_storeu_ps((__m256*)(vz + i*128+32), by);
//third phase
m_a = _mm256_extracti128_si256(bx, 1);
m_x = _mm256_cvtepi8_epi16(m_a);
m_x = _mm256_mullo_epi16(m_x, alpha_v);
x_0 = _mm256_extracti128_si256(m_x, 0);
x0_32 = _mm256_cvtepi16_epi32(x_0);
fx0 = _mm256_cvtepi32_ps(x0_32);
fx0 = _mm256_mul_ps(fx0, d);
by = _mm256_loadu_ps((const __m256 *)(vy+i*128+64));
by = _mm256_add_ps(by, fx0);
_mm256_storeu_ps((__m256*)(vz + i*128+64), by);
//fourth phase
x_0 = _mm256_extracti128_si256(m_x, 1);
x0_32 = _mm256_cvtepi16_epi32(x_0);
fx0 = _mm256_cvtepi32_ps(x0_32);
fx0 = _mm256_mul_ps(fx0, d);
by = _mm256_loadu_ps((const __m256 *)(vy+i*128+96));
by = _mm256_add_ps(by, fx0);
_mm256_storeu_ps((__m256*)(vz + i*128+96), by);
}
}
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
const int qk = QK8_0;
const int nb = n / qk;
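For readers who would rather not trace the AVX2 intrinsics, ggml_axpy_q4_0_q8_0 above is, in scalar terms, roughly the following. This is a sketch only, assuming the usual q4_0 block layout (one fp16 scale plus 32 packed nibbles, low nibbles first) and float input/output buffers; the fp16 fields are shown as float so the sketch stays self-contained.

// Scalar equivalent of ggml_axpy_q4_0_q8_0 (illustrative only). ggml stores d and scale as
// ggml_fp16_t; they are plain float here to keep the sketch free of ggml internals.
#include <cassert>
#include <cstdint>

struct block_q4_0_sketch {
    float   d;      // per-block scale
    uint8_t qs[16]; // 32 x 4-bit quants, two per byte (low nibbles are elements 0..15)
};

static void axpy_q4_0_ref(int n, const block_q4_0_sketch * x,
                          const float * y, float * z, int8_t alpha, float scale) {
    const int qk = 32;
    assert(n % qk == 0);
    for (int i = 0; i < n / qk; ++i) {
        const float d = x[i].d * scale; // combined per-block scale, as in the AVX2 path
        for (int j = 0; j < qk / 2; ++j) {
            const int lo = (x[i].qs[j] & 0x0F) - 8; // element j
            const int hi = (x[i].qs[j] >> 4)   - 8; // element j + 16
            z[i*qk + j]        = y[i*qk + j]        + d * (float)(lo * (int)alpha);
            z[i*qk + j + qk/2] = y[i*qk + j + qk/2] + d * (float)(hi * (int)alpha);
        }
    }
}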


@ -210,6 +210,8 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int
void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
void ggml_axpy_q4_0_q8_0(const int n, const void * restrict vx, const void * restrict vy, const void * restrict vz, int8_t alpha, ggml_fp16_t scale);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);

ggml.c: 1492 changes

File diff suppressed because it is too large

ggml.h: 69 changes

@ -207,6 +207,14 @@
#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>
#ifdef __cplusplus
#include <atomic>
using std::atomic_int;
using std::memory_order;
using std::memory_order_acquire;
#else /* not __cplusplus */
#include <stdatomic.h>
#endif /* __cplusplus */
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
@ -232,6 +240,7 @@
#define GGML_EXIT_ABORTED 1
#define GGUF_MAGIC "GGUF"
#define GGUF_POWERINFER_MAGIC "PWRI"
#define GGUF_VERSION 3
@ -336,6 +345,11 @@ extern "C" {
GGML_BACKEND_GPU_SPLIT = 20,
};
enum ggml_sparse_deriv {
GGML_DENSE_INFERENCE = 0,
GGML_SPARSE_INFERENCE = 1,
};
// model file types
enum ggml_ftype {
GGML_FTYPE_UNKNOWN = -1,
@ -382,6 +396,7 @@ extern "C" {
GGML_OP_GROUP_NORM,
GGML_OP_MUL_MAT,
GGML_OP_AXPY,
GGML_OP_OUT_PROD,
GGML_OP_SCALE,
@ -504,6 +519,7 @@ extern "C" {
struct ggml_tensor * src[GGML_MAX_SRC];
// performance
atomic_int is_finish;
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
@ -520,6 +536,9 @@ extern "C" {
char padding[12];
};
static const int64_t GGML_NE_WILDCARD = -1;
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
// the compute plan that needs to be prepared for ggml_graph_compute()
@ -573,6 +592,22 @@ extern "C" {
void * data;
};
struct ggml_context {
size_t mem_size;
void * mem_buffer;
bool mem_buffer_owned;
bool no_alloc;
bool no_alloc_save; // this is used to save the no_alloc state when using scratch buffers
int n_objects;
struct ggml_object * objects_begin;
struct ggml_object * objects_end;
struct ggml_scratch scratch;
struct ggml_scratch scratch_save;
};
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
@ -600,6 +635,7 @@ extern "C" {
// work buffer for all threads
size_t wsize;
void * wdata;
atomic_int *aic;
};
// misc
@ -618,6 +654,8 @@ extern "C" {
GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
GGML_API
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
@ -727,6 +765,7 @@ extern "C" {
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
GGML_API int32_t * ggml_get_data_i32(const struct ggml_tensor * tensor);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
@ -735,6 +774,9 @@ extern "C" {
GGML_ATTRIBUTE_FORMAT(2, 3)
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
GGML_API void ggml_set_backend(struct ggml_tensor * tensor, enum ggml_backend_type backend);
//
// operations on tensors with backpropagation
//
@ -753,6 +795,12 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor *ggml_add_idx(
struct ggml_context *ctx,
struct ggml_tensor *a,
struct ggml_tensor *b,
struct ggml_tensor *idx);
GGML_API struct ggml_tensor * ggml_add_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1027,6 +1075,25 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor *ggml_mul_mat_idx(
struct ggml_context *ctx,
struct ggml_tensor *a,
struct ggml_tensor *b,
struct ggml_tensor *idx,
struct ggml_tensor *d);
GGML_API struct ggml_tensor *ggml_mul_mat_special(
struct ggml_context *ctx,
struct ggml_tensor *a,
struct ggml_tensor *b,
struct ggml_tensor *idx,
struct ggml_tensor *d,
struct ggml_tensor *ref);
GGML_API struct ggml_tensor *ggml_axpy(
struct ggml_context *ctx,
struct ggml_tensor *a,
struct ggml_tensor *b,
struct ggml_tensor *c,
struct ggml_tensor *d);
// A: m columns, n rows,
// B: p columns, n rows,
@ -2013,6 +2080,7 @@ extern "C" {
};
GGML_API struct gguf_context * gguf_init_empty(void);
GGML_API struct gguf_context * gguf_init_empty_sparse(void);
GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
//GGML_API struct gguf_context * gguf_init_from_buffer(..);
@ -2049,6 +2117,7 @@ extern "C" {
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
GGML_API enum ggml_sparse_deriv gguf_get_sparse_deriv(const struct gguf_context * ctx);
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);


@ -115,6 +115,10 @@ class MODEL_TENSOR(IntEnum):
FFN_NORM = auto()
ATTN_Q_NORM = auto()
ATTN_K_NORM = auto()
FFN_DOWN_T = auto()
FC_1 = auto()
FC_2 = auto()
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@ -155,6 +159,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
MODEL_TENSOR.FFN_DOWN_T: "blk.{bid}.ffn_down_t",
MODEL_TENSOR.FC_1: "blk.{bid}.fc1",
MODEL_TENSOR.FC_2: "blk.{bid}.fc2",
}
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@ -173,6 +180,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_DOWN_T,
MODEL_TENSOR.FC_1,
MODEL_TENSOR.FC_2,
],
MODEL_ARCH.GPTNEOX: [
MODEL_TENSOR.TOKEN_EMBD,


@ -194,6 +194,14 @@ class TensorNameMap:
MODEL_TENSOR.ROPE_FREQS: (
"language_model.encoder.layers.{bid}.self_attention.rotary_emb.inv_freq", # persimmon
),
MODEL_TENSOR.FC_1: (
"model.layers.{bid}.fc1",
),
MODEL_TENSOR.FC_2: (
"model.layers.{bid}.fc2",
),
}
mapping: dict[str, tuple[MODEL_TENSOR, str]]

llama.cpp: 916 changes

File diff suppressed because it is too large


@ -293,6 +293,7 @@ extern "C" {
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
LLAMA_API bool llama_use_sparse_inference(const struct llama_model * model);
LLAMA_API int llama_n_vocab (const struct llama_model * model);
LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
@ -340,6 +341,13 @@ extern "C" {
const char * path_base_model,
int n_threads);
LLAMA_API int llama_model_apply_mlp_from_file(
struct llama_model * model,
const char * path_mlp,
bool use_mmap);
LLAMA_API int llama_model_apply_augmentation(struct llama_model * model);
//
// KV cache
//

scripts/export-gpu-split.py (new file): 142 changes

@ -0,0 +1,142 @@
#!/usr/bin/env python3
import argparse
import torch
import torch.nn as tnn
from pathlib import Path
import os
import re
import struct
from typing import Any, BinaryIO
import numpy as np
import pickle
class ReluMLP(tnn.Module):
def __init__(self, input_dim, hidden_dim, output_dim):
super(ReluMLP, self).__init__()
self.fc1 = tnn.Linear(input_dim, hidden_dim, bias=False)
self.relu = tnn.ReLU()
self.fc2 = tnn.Linear(hidden_dim, output_dim, bias=False)
def forward(self, x):
x = self.fc1(x)
x = self.relu(x)
x = self.fc2(x)
return x
def _load_mlp_model(model_file: Path):
model = torch.load(model_file)
# hidden_size, input_size = model.get("fc1.weight").shape
# output_size, _ = model.get("fc2.weight").shape
# mlp = ReluMLP(input_size, hidden_size, output_size)
# mlp.load_state_dict(model)
return model
def load_mlp_predictors(models_base: Path):
# TODO: might need a specification file to indicate which models to load.
# But for now, let's assume it is a plain directory of models_{0, ... , n_layers - 1}.pt
*_, files = next(os.walk(models_base))
return [_load_mlp_model(models_base / f"activation_{i}.pt") for i in range(len(files))]
def write_file_header(fout: BinaryIO, n_tensors: int) -> None:
fout.write(b"gglp"[::-1]) # magic (GGml mLP)
fout.write(struct.pack("i", 1)) # file version
# TODO: If we found we need more common parameters, we can add them here.
fout.write(struct.pack("i", n_tensors))
def write_tensor_header(
fout: BinaryIO, key: str, shape: tuple[int, ...], dtype: np.dtype
) -> None:
_NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1, "int32": 18}
bkey = key.encode("utf-8")
fout.write(
struct.pack("iii", len(shape), len(bkey), _NUMPY_TYPE_TO_FTYPE[dtype.name])
)
fout.write(struct.pack("i" * len(shape), *shape))
fout.write(bkey)
# Aligns to 32 bytes
fout.seek((fout.tell() + 31) & -32)
# TODO: need to add more details in key name to indicate the network, layer number, etc.
def _translate_mlp_key(key: str) -> str:
match = re.match(r"^(fc\d+).weight$", key)
if not match or len(match.groups()) != 1:
raise ValueError(f"Unexpected key: {key}")
return f"{match.group(1)}.weight.mlp"
def append_mlp_model(fout: BinaryIO, model: ReluMLP) -> None:
model_dict = model.state_dict()
for k, v in model_dict.items():
key = _translate_mlp_key(k)
# torch.nn.Linear stores the weight matrix as (output_dim, input_dim), so does GGML.
weights = v.half().detach().numpy()
# the GGML tensor header lists dimensions as (input_dim, output_dim), so reverse the shape
dims = weights.shape[::-1]
print(
f"{k} => {key} {weights.shape} {weights.dtype} {weights.nbytes/1024/1024} MiB"
)
# TODO: add option to write in float32
write_tensor_header(fout, key, dims, np.dtype("float16"))
weights.tofile(fout)
def append_gpu_idx(fout: BinaryIO, activation, select_count) -> None:
values, indices = torch.topk(activation, k=int(select_count))
gpu_idx = torch.zeros_like(activation)
gpu_idx[indices] = 1
gpu_idx = gpu_idx.numpy().astype(np.int32)
weights = gpu_idx
dims = gpu_idx.shape[::-1]
key = "gpu_idx"
print(
f"{key} => {key} {weights.shape} {weights.dtype} {weights.nbytes/1024/1024} MiB"
)
write_tensor_header(fout, key, dims, np.dtype("int32"))
weights.tofile(fout)
indices = indices.numpy().astype(np.int32)
weights = indices
dims = weights.shape[::-1]
key = "gpu_bucket"
print(
f"{key} => {key} {weights.shape} {weights.dtype} {weights.nbytes/1024/1024} MiB"
)
write_tensor_header(fout, key, dims, np.dtype("int32"))
weights = np.sort(weights)
weights.tofile(fout)
def main(predictors_path: str, output_path: str, solver_path: str):
predictors = load_mlp_predictors(Path(predictors_path)) # predictor => activation count
n_tensors = len(predictors) * 2 # gpu_idx and gpu_bucket
print(f"found {len(predictors)} MLP adapters with {n_tensors} tensors")
with open(solver_path, "rb") as f:
loaded_lst = pickle.load(f)
# print(f"check solver {loaded_lst}")
with open(output_path, "wb") as fout:
fout.truncate()
write_file_header(fout, n_tensors=n_tensors)
for i, activation in enumerate(predictors):
print(f"appending gpu idx layer-{i}")
append_gpu_idx(fout, activation, loaded_lst[i])
# append_gpu_idx(fout, activation, (32768*0.0))
print(f"converted MLP adapters from {predictors_path} to {output_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("predictors_path", help="path to the MLP predictors")
parser.add_argument(
"output_path",
help="path to the output GGML adapter",
default="./ggml-mlp-adapters.bin",
)
parser.add_argument("solver", help="path to the solver")
args = parser.parse_args()
main(args.predictors_path, args.output_path, args.solver)
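To make the adapter layout emitted by write_file_header and write_tensor_header concrete, here is a small reader sketch. It is illustrative only, not the loader used by llama.cpp, and assumes exactly the fields written above: the reversed "gglp" magic, an int32 version and tensor count, then per tensor the (n_dims, name_len, ftype) triple, the int32 shape, the name bytes, 32-byte alignment, and the raw data.

// Reader sketch for the adapter file written above (illustrative, not the llama.cpp loader).
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

static bool dump_gpu_split_file(const char * path) {
    std::FILE * f = std::fopen(path, "rb");
    if (f == NULL) {
        return false;
    }
    char magic[4];
    int32_t version = 0;
    int32_t n_tensors = 0;
    bool ok = std::fread(magic, 1, 4, f) == 4 && std::memcmp(magic, "plgg", 4) == 0 // b"gglp"[::-1]
           && std::fread(&version,   sizeof(version),   1, f) == 1
           && std::fread(&n_tensors, sizeof(n_tensors), 1, f) == 1;
    for (int32_t t = 0; ok && t < n_tensors; ++t) {
        int32_t n_dims = 0, name_len = 0, ftype = 0;
        ok = std::fread(&n_dims,   4, 1, f) == 1
          && std::fread(&name_len, 4, 1, f) == 1
          && std::fread(&ftype,    4, 1, f) == 1;
        if (!ok) break;
        std::vector<int32_t> shape(n_dims);
        std::string name(name_len, '\0');
        ok = std::fread(shape.data(), 4, n_dims, f) == (size_t) n_dims
          && std::fread(&name[0], 1, name_len, f) == (size_t) name_len;
        if (!ok) break;
        std::fseek(f, (std::ftell(f) + 31) & -32, SEEK_SET);  // writer aligns data to 32 bytes
        int64_t n_elem = 1;
        for (int32_t dim : shape) n_elem *= dim;
        const int64_t elem_size = (ftype == 1) ? 2 : 4;       // 0: f32, 1: f16, 18: i32
        std::printf("%s: ftype=%d, %lld elements\n", name.c_str(), ftype, (long long) n_elem);
        std::fseek(f, (long)(n_elem * elem_size), SEEK_CUR);  // skip the tensor data
    }
    std::fclose(f);
    return ok;
}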