Merge branch 'master' of github.com:psugihara/llama.cpp into swiftui-timings
This commit is contained in:
commit
0f29c0247f
11 changed files with 724 additions and 19 deletions
|
@ -103,6 +103,7 @@ as the main playground for developing new features for the [ggml](https://github
|
|||
- [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
|
||||
- [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral)
|
||||
- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
|
||||
- [x] [GPT-2](https://huggingface.co/gpt2)
|
||||
|
||||
**Multimodal models:**
|
||||
|
||||
|
|
116
awq-py/README.md
Normal file
116
awq-py/README.md
Normal file
|
@ -0,0 +1,116 @@
|
|||
# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp
|
||||
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
|
||||
|
||||
**Supported models:**
|
||||
|
||||
- [X] LLaMA
|
||||
- [x] LLaMA 2
|
||||
- [X] MPT
|
||||
- [X] Mistral AI v0.1
|
||||
- [ ] Bloom
|
||||
- [ ] Mixtral MoE
|
||||
|
||||
**TODO:**
|
||||
- [x] Update version work with both MPT and MPT-AWQ model
|
||||
- [ ] Add OPT model
|
||||
- [ ] Add Bloom model
|
||||
- [ ] Add Mixtral MoE
|
||||
- [ ] Support w3, w2
|
||||
|
||||
|
||||
## Contents
|
||||
|
||||
- [Install](##Install)
|
||||
- [Convert](##Convert)
|
||||
- [Quantize](##Quantize)
|
||||
- [Test](##Test)
|
||||
- [Benchmark](##Benchmark)
|
||||
- [Results](##Results)
|
||||
|
||||
## Install
|
||||
Install requirements
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT
|
||||
```bash
|
||||
git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
|
||||
```
|
||||
|
||||
## Convert
|
||||
Example for llama model
|
||||
```bash
|
||||
# For llama7b and llama2 models
|
||||
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
|
||||
# For mistral and mpt models
|
||||
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
|
||||
```
|
||||
|
||||
## Quantize
|
||||
```bash
|
||||
# We only benchmark and confirm the results on q4_0, q4_1, and q2_k types.
|
||||
./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
|
||||
```
|
||||
|
||||
## Test
|
||||
```bash
|
||||
# For all models.
|
||||
./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
## Benchmark
|
||||
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
|
||||
```bash
|
||||
# For llama and llama2, and mistral models.
|
||||
./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
|
||||
```
|
||||
|
||||
## Results
|
||||
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
|
||||
We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
|
||||
|
||||
### Llama 7B (Build with OpenBLAS)
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|
||||
|-----------:|--------------|-------:|-------:|-------:|-------:|
|
||||
|Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
|
||||
|Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|
||||
|Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|AWQ-LLama 7B| perplexity | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
|
||||
|AWQ-LLama 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|
||||
|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|
||||
|
||||
### Llama2 7B (Build with CuBLAS)
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|
||||
|------------:|--------------|-------:|-------:|-------:|-------:|
|
||||
|Llama2 7B | perplexity | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
|
||||
|Llama2 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|
||||
|Llama2 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|AWQ-LLama2 7B| perplexity | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
|
||||
|AWQ-LLama2 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|
||||
|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|
||||
|
||||
### Mistral 7B v0.1 (Build with CuBLAS)
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|
||||
|-------------:|--------------|-------:|-------:|-------:|-------:|
|
||||
|Mistral 7B | perplexity | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
|
||||
|Mistral 7B | file size | 14.5G | 4.1G | 4.5G | 3.1G |
|
||||
|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|AWQ-Mistral 7B| perplexity | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
|
||||
|AWQ-Mistral 7B| file size | 14.5G | 4.1G | 4.5G | 3.1G |
|
||||
|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|
||||
### MPT 7B (Build with OpenBLAS)
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|
||||
|---------:|--------------|-------:|-------:|-------:|--------:|
|
||||
|MPT 7B | perplexity | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
|
||||
|MPT 7B | file size | 13.7G | 3.9G | 4.3G | 2.8G |
|
||||
|MPT 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|AWQ-MPT 7B| perplexity | 8.4944 | 8.7053 | 8.6750 | 10.2873|
|
||||
|AWQ-MPT 7B| file size | 13.7G | 3.9G | 4.3G | 2.8G |
|
||||
|AWQ-MPT 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
254
awq-py/awq/apply_awq.py
Normal file
254
awq-py/awq/apply_awq.py
Normal file
|
@ -0,0 +1,254 @@
|
|||
"""
|
||||
Implements the AWQ for llama.cpp use cases.
|
||||
Original paper: https://arxiv.org/abs/2306.00978
|
||||
|
||||
This code is based on versions of the AWQ implementation found in the following repositories:
|
||||
* https://github.com/mit-han-lab/llm-awq
|
||||
* https://github.com/casper-hansen/AutoAWQ
|
||||
"""
|
||||
|
||||
import os
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoConfig
|
||||
from transformers.models.bloom.modeling_bloom import BloomGelu
|
||||
from transformers.models.llama.modeling_llama import LlamaRMSNorm
|
||||
from transformers.activations import GELUActivation
|
||||
|
||||
|
||||
class ScaledActivation(nn.Module):
|
||||
"""
|
||||
ScaledActivation module wraps an existing activation function and applies a
|
||||
scale factor to its output.
|
||||
|
||||
Args:
|
||||
module (nn.Module): The activation function to be scaled.
|
||||
scales (torch.Tensor): A tensor of size (num_features,) containing the initial
|
||||
scale factors for each feature.
|
||||
|
||||
Returns:
|
||||
torch.Tensor: The scaled output of the activation function.
|
||||
"""
|
||||
|
||||
def __init__(self, module, scales):
|
||||
super().__init__()
|
||||
self.act = module
|
||||
self.scales = nn.Parameter(scales.data)
|
||||
|
||||
def forward(self, x):
|
||||
return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
|
||||
|
||||
|
||||
def set_op_by_name(layer, name, new_module):
|
||||
"""
|
||||
Set the new module for given module's name.
|
||||
|
||||
Args:
|
||||
layer (nn.Module): The layer in which to replace the submodule.
|
||||
name (str): The path to the submodule to be replaced, using dot notation
|
||||
to access nested modules.
|
||||
new_module (nn.Module): The new module to replace the existing one.
|
||||
"""
|
||||
levels = name.split(".")
|
||||
if len(levels) > 1:
|
||||
mod_ = layer
|
||||
for l_idx in range(len(levels) - 1):
|
||||
if levels[l_idx].isdigit():
|
||||
mod_ = mod_[int(levels[l_idx])]
|
||||
else:
|
||||
mod_ = getattr(mod_, levels[l_idx])
|
||||
setattr(mod_, levels[-1], new_module)
|
||||
else:
|
||||
setattr(layer, name, new_module)
|
||||
|
||||
|
||||
def get_op_by_name(module, op_name):
|
||||
"""
|
||||
Retrieves a submodule within a given layer based on its name.
|
||||
|
||||
Args:
|
||||
module (nn.Module): The layer containing the submodule to find.
|
||||
op_name (str): The name of the submodule.
|
||||
|
||||
Returns:
|
||||
nn.Module: The requested submodule found within the given layer.
|
||||
|
||||
Raises:
|
||||
ValueError: If the specified submodule cannot be found within the layer.
|
||||
"""
|
||||
for name, m in module.named_modules():
|
||||
if name == op_name:
|
||||
return m
|
||||
raise ValueError(f"Cannot find op {op_name} in module {module}")
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def scale_ln_fcs(ln, fcs, scales):
|
||||
"""
|
||||
Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
|
||||
|
||||
Args:
|
||||
ln (nn.LayerNorm): The LayerNorm module to be scaled.
|
||||
fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
|
||||
scales (torch.Tensor): A 1D tensor of size (num_features,).
|
||||
"""
|
||||
|
||||
if not isinstance(fcs, list):
|
||||
fcs = [fcs]
|
||||
|
||||
scales = scales.to(ln.weight.device)
|
||||
|
||||
ln.weight.div_(scales)
|
||||
if hasattr(ln, "bias") and ln.bias is not None:
|
||||
ln.bias.div_(scales)
|
||||
|
||||
for fc in fcs:
|
||||
fc.weight.mul_(scales.view(1, -1))
|
||||
|
||||
for p in ln.parameters():
|
||||
assert torch.isnan(p).sum() == 0
|
||||
for fc in fcs:
|
||||
for p in fc.parameters():
|
||||
assert torch.isnan(p).sum() == 0
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def scale_fc_fc(fc1, fc2, scales):
|
||||
"""
|
||||
Scales the weights of two fully-connected layers in a specific pattern.
|
||||
|
||||
Args:
|
||||
fc1 (nn.Linear): The first fully-connected layer to be scaled.
|
||||
fc2 (nn.Linear): The second fully-connected layer to be scaled.
|
||||
scales (torch.Tensor): A 1D tensor of size (num_features,).
|
||||
"""
|
||||
assert isinstance(fc1, nn.Linear)
|
||||
assert isinstance(fc2, nn.Linear)
|
||||
|
||||
scales = scales.to(fc1.weight.device)
|
||||
|
||||
fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
|
||||
if fc1.bias is not None:
|
||||
fc1.bias.div_(scales.view(-1))
|
||||
|
||||
fc2.weight.mul_(scales.view(1, -1))
|
||||
|
||||
for p in fc1.parameters():
|
||||
assert torch.isnan(p).sum() == 0
|
||||
for p in fc2.parameters():
|
||||
assert torch.isnan(p).sum() == 0
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def scale_gelu_fc(gelu, fc, scales):
|
||||
"""
|
||||
Scales the weight of a GELU activation and a fully-connected layer proportionally.
|
||||
|
||||
Args:
|
||||
gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
|
||||
fc (nn.Linear): The fully-connected layer to be scaled.
|
||||
scales (torch.Tensor): A 1D tensor of size (num_features,).
|
||||
|
||||
Raises:
|
||||
TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
|
||||
TypeError: If the `fc` module is not of type `nn.Linear`.
|
||||
"""
|
||||
assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
|
||||
assert isinstance(fc, nn.Linear)
|
||||
|
||||
fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
|
||||
|
||||
for p in fc.parameters():
|
||||
assert torch.isnan(p).sum() == 0
|
||||
|
||||
|
||||
def apply_scale(module, scales_list, input_feat_dict=None):
|
||||
"""
|
||||
Applies different scaling strategies to layers based on their type and hierarchy within a given module.
|
||||
|
||||
Args:
|
||||
module (nn.Module): The module containing the layers to be scaled.
|
||||
scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
|
||||
* prev_op_name (str): The name of the preceding operation or module,
|
||||
relative to which the layers to be scaled are located.
|
||||
* layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
|
||||
* scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
|
||||
input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
|
||||
input features (optional).
|
||||
"""
|
||||
for prev_op_name, layer_names, scales in scales_list:
|
||||
prev_op = get_op_by_name(module, prev_op_name)
|
||||
layers = [get_op_by_name(module, name) for name in layer_names]
|
||||
|
||||
prev_op.cuda()
|
||||
for layer in layers:
|
||||
layer.cuda()
|
||||
scales.cuda()
|
||||
|
||||
if isinstance(prev_op, nn.Linear):
|
||||
assert len(layers) == 1
|
||||
scale_fc_fc(prev_op, layers[0], scales)
|
||||
elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
|
||||
scale_ln_fcs(prev_op, layers, scales)
|
||||
elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
|
||||
new_module = ScaledActivation(prev_op, scales)
|
||||
set_op_by_name(module, prev_op_name, new_module)
|
||||
scale_gelu_fc(prev_op, layers[0], scales)
|
||||
else:
|
||||
raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
|
||||
|
||||
# apply the scaling to input feat if given; prepare it for clipping
|
||||
if input_feat_dict is not None:
|
||||
for layer_name in layer_names:
|
||||
inp = input_feat_dict[layer_name]
|
||||
inp.div_(scales.view(1, -1).to(inp.device))
|
||||
|
||||
prev_op.cpu()
|
||||
for layer in layers:
|
||||
layer.cpu()
|
||||
scales.cpu()
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def apply_clip(module, clip_list):
|
||||
"""
|
||||
Applies element-wise clipping to the weight of a specific layer within a given module.
|
||||
|
||||
Args:
|
||||
module (nn.Module): The module containing the layer to be clipped.
|
||||
clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
|
||||
* name (str): The name of the layer to be clipped, relative to the root of the module.
|
||||
* max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
|
||||
"""
|
||||
for name, max_val in clip_list:
|
||||
layer = get_op_by_name(module, name)
|
||||
layer.cuda()
|
||||
max_val = max_val.to(layer.weight.device)
|
||||
org_shape = layer.weight.shape
|
||||
layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
|
||||
layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
|
||||
layer.weight.data = layer.weight.data.reshape(org_shape)
|
||||
layer.cpu()
|
||||
|
||||
|
||||
def add_scale_weights(model_path, scale_path, tmp_path):
|
||||
"""
|
||||
Adds pre-computed Activation Weight Quantization (AWQ) results to a model,
|
||||
including scaling factors and clipping bounds.
|
||||
|
||||
Args:
|
||||
model_path (str): Path to the pre-trained model to be equipped with AWQ.
|
||||
scale_path (str): Path to the AWQ scale factors (.pt file).
|
||||
tmp_path (str): Path to the temporary directory where the equipped model will be saved.
|
||||
"""
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_path, config=config, trust_remote_code=True
|
||||
)
|
||||
model.eval()
|
||||
awq_results = torch.load(str(scale_path), map_location="cpu")
|
||||
apply_scale(model, awq_results["scale"])
|
||||
apply_clip(model, awq_results["clip"])
|
||||
model.save_pretrained(str(tmp_path))
|
||||
os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
|
2
awq-py/requirements.txt
Normal file
2
awq-py/requirements.txt
Normal file
|
@ -0,0 +1,2 @@
|
|||
torch>=2.0.0
|
||||
transformers>=4.32.0
|
|
@ -46,7 +46,7 @@ class Model:
|
|||
self.part_names = self._get_part_names()
|
||||
self.hparams = Model.load_hparams(self.dir_model)
|
||||
self.model_arch = self._get_model_architecture()
|
||||
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
|
||||
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
@ -59,7 +59,7 @@ class Model:
|
|||
from safetensors import safe_open
|
||||
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
|
||||
else:
|
||||
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
|
||||
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True))
|
||||
|
||||
with ctx as model_part:
|
||||
for name in model_part.keys():
|
||||
|
@ -182,6 +182,8 @@ class Model:
|
|||
return QwenModel
|
||||
if model_architecture == "MixtralForCausalLM":
|
||||
return MixtralModel
|
||||
if model_architecture == "GPT2LMHeadModel":
|
||||
return GPT2Model
|
||||
if model_architecture == "PhiForCausalLM":
|
||||
return Phi2Model
|
||||
if model_architecture == "PlamoForCausalLM":
|
||||
|
@ -225,6 +227,8 @@ class Model:
|
|||
return gguf.MODEL_ARCH.QWEN
|
||||
if arch == "MixtralForCausalLM":
|
||||
return gguf.MODEL_ARCH.LLAMA
|
||||
if arch == "GPT2LMHeadModel":
|
||||
return gguf.MODEL_ARCH.GPT2
|
||||
if arch == "PhiForCausalLM":
|
||||
return gguf.MODEL_ARCH.PHI2
|
||||
if arch == "PlamoForCausalLM":
|
||||
|
@ -464,7 +468,11 @@ class MPTModel(Model):
|
|||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if "scales" in name:
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
|
||||
new_name = new_name.replace("scales", "act.scales")
|
||||
else:
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
@ -989,6 +997,68 @@ class QwenModel(Model):
|
|||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
class GPT2Model(Model):
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_block_count(self.hparams["n_layer"])
|
||||
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
|
||||
def write_tensors(self):
|
||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
|
||||
for name, data_torch in self.get_tensors():
|
||||
# we don't need these
|
||||
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias")):
|
||||
continue
|
||||
|
||||
if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")):
|
||||
data_torch = data_torch.transpose(1, 0)
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
# note: GPT2 output is tied to (same as) wte in original model
|
||||
if new_name == "token_embd.weight":
|
||||
print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
|
||||
class Phi2Model(Model):
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
|
@ -1095,6 +1165,9 @@ def parse_args() -> argparse.Namespace:
|
|||
"--vocab-only", action="store_true",
|
||||
help="extract only the vocab",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--awq-path", type=Path, default=None,
|
||||
help="Path to scale awq cache file")
|
||||
parser.add_argument(
|
||||
"--outfile", type=Path,
|
||||
help="path to write to; default: based on input",
|
||||
|
@ -1115,6 +1188,20 @@ def parse_args() -> argparse.Namespace:
|
|||
args = parse_args()
|
||||
|
||||
dir_model = args.model
|
||||
|
||||
if args.awq_path:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
||||
from awq.apply_awq import add_scale_weights
|
||||
tmp_model_path = args.model / "weighted_model"
|
||||
dir_model = tmp_model_path
|
||||
if tmp_model_path.is_dir():
|
||||
print(f"{tmp_model_path} exists as a weighted model.")
|
||||
else:
|
||||
tmp_model_path.mkdir(parents=True, exist_ok=True)
|
||||
print("Saving new weighted model ...")
|
||||
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
||||
print(f"Saved weighted model at {tmp_model_path}.")
|
||||
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
|
14
convert.py
14
convert.py
|
@ -1187,6 +1187,7 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
# We currently only support Q8_0 output on little endian systems.
|
||||
output_choices.append("q8_0")
|
||||
parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file")
|
||||
parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None)
|
||||
parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model")
|
||||
parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file")
|
||||
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
|
||||
|
@ -1200,6 +1201,19 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
parser.add_argument("--padvocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
|
||||
|
||||
args = parser.parse_args(args_in)
|
||||
if args.awq_path:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
||||
from awq.apply_awq import add_scale_weights
|
||||
tmp_model_path = args.model / "weighted_model"
|
||||
if tmp_model_path.is_dir():
|
||||
print(f"{tmp_model_path} exists as a weighted model.")
|
||||
else:
|
||||
tmp_model_path.mkdir(parents=True, exist_ok=True)
|
||||
print("Saving new weighted model ...")
|
||||
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
||||
print(f"Saved weighted model at {tmp_model_path}.")
|
||||
args.model = tmp_model_path
|
||||
|
||||
if args.dump_single:
|
||||
model_plus = lazy_load_file(args.model)
|
||||
do_dump_model(model_plus)
|
||||
|
|
|
@ -120,6 +120,7 @@ class MODEL_TENSOR(IntEnum):
|
|||
FFN_GATE = auto()
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
FFN_ACT = auto()
|
||||
FFN_GATE_EXP = auto()
|
||||
FFN_DOWN_EXP = auto()
|
||||
FFN_UP_EXP = auto()
|
||||
|
@ -169,6 +170,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}",
|
||||
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}",
|
||||
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}",
|
||||
|
@ -269,6 +271,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_ACT,
|
||||
],
|
||||
MODEL_ARCH.GPTJ: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
@ -367,7 +370,16 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.GPT2: [
|
||||
# TODO
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.PHI2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
|
|
@ -17,6 +17,7 @@ class TensorNameMap:
|
|||
"tok_embeddings", # llama-pth
|
||||
"embeddings.word_embeddings", # bert
|
||||
"language_model.embedding.word_embeddings", # persimmon
|
||||
"wte", # gpt2
|
||||
"transformer.embd.wte", # phi2
|
||||
),
|
||||
|
||||
|
@ -34,6 +35,7 @@ class TensorNameMap:
|
|||
MODEL_TENSOR.POS_EMBD: (
|
||||
"transformer.wpe", # gpt2
|
||||
"embeddings.position_embeddings", # bert
|
||||
"wpe", # gpt2
|
||||
),
|
||||
|
||||
# Output
|
||||
|
@ -53,7 +55,7 @@ class TensorNameMap:
|
|||
"norm", # llama-pth
|
||||
"embeddings.LayerNorm", # bert
|
||||
"transformer.norm_f", # mpt
|
||||
"ln_f", # refact bloom qwen
|
||||
"ln_f", # refact bloom qwen gpt2
|
||||
"language_model.encoder.final_layernorm", # persimmon
|
||||
"lm_head.ln", # phi2
|
||||
),
|
||||
|
@ -78,6 +80,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.input_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln1", # yi
|
||||
"h.{bid}.ln_1", # gpt2
|
||||
"transformer.h.{bid}.ln", # phi2
|
||||
"model.layers.layers.{bid}.norm", # plamo
|
||||
),
|
||||
|
@ -95,6 +98,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.self_attention.query_key_value", # falcon
|
||||
"h.{bid}.self_attention.query_key_value", # bloom
|
||||
"language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon
|
||||
"h.{bid}.attn.c_attn", # gpt2
|
||||
"transformer.h.{bid}.mixer.Wqkv", # phi2
|
||||
),
|
||||
|
||||
|
@ -137,6 +141,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.attention.output.dense", # bert
|
||||
"transformer.h.{bid}.attn.out_proj", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
|
||||
"h.{bid}.attn.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mixer.out_proj", # phi2
|
||||
"model.layers.layers.{bid}.self_attn.o_proj", # plamo
|
||||
),
|
||||
|
@ -159,6 +164,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.output.LayerNorm", # bert
|
||||
"language_model.encoder.layers.{bid}.post_attention_layernorm", # persimmon
|
||||
"model.layers.{bid}.ln2", # yi
|
||||
"h.{bid}.ln_2", # gpt2
|
||||
),
|
||||
|
||||
MODEL_TENSOR.FFN_GATE_INP: (
|
||||
|
@ -179,6 +185,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mlp.fc_in", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
|
||||
"transformer.h.{bid}.mlp.w1", # qwen
|
||||
"h.{bid}.mlp.c_fc", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc1", # phi2
|
||||
"model.layers.layers.{bid}.mlp.up_proj", # plamo
|
||||
),
|
||||
|
@ -188,6 +195,11 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral
|
||||
),
|
||||
|
||||
# AWQ-activation gate
|
||||
MODEL_TENSOR.FFN_ACT: (
|
||||
"transformer.blocks.{bid}.ffn.act", # mpt
|
||||
),
|
||||
|
||||
# Feed-forward gate
|
||||
MODEL_TENSOR.FFN_GATE: (
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||
|
@ -213,6 +225,7 @@ class TensorNameMap:
|
|||
"encoder.layer.{bid}.output.dense", # bert
|
||||
"transformer.h.{bid}.mlp.fc_out", # gpt-j
|
||||
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
|
||||
"h.{bid}.mlp.c_proj", # gpt2
|
||||
"transformer.h.{bid}.mlp.fc2", # phi2
|
||||
"model.layers.layers.{bid}.mlp.down_proj", # plamo
|
||||
),
|
||||
|
|
233
llama.cpp
233
llama.cpp
|
@ -354,6 +354,7 @@ enum llm_tensor {
|
|||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
LLM_TENSOR_FFN_ACT,
|
||||
LLM_TENSOR_FFN_DOWN_EXP,
|
||||
LLM_TENSOR_FFN_GATE_EXP,
|
||||
LLM_TENSOR_FFN_UP_EXP,
|
||||
|
@ -422,6 +423,15 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||
LLM_ARCH_GPT2,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_POS_EMBD, "position_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
||||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -473,6 +483,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
{ LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -1254,6 +1265,10 @@ enum e_model {
|
|||
MODEL_40B,
|
||||
MODEL_65B,
|
||||
MODEL_70B,
|
||||
MODEL_SMALL,
|
||||
MODEL_MEDIUM,
|
||||
MODEL_LARGE,
|
||||
MODEL_XL,
|
||||
};
|
||||
|
||||
static const size_t kiB = 1024;
|
||||
|
@ -1285,6 +1300,7 @@ struct llama_hparams {
|
|||
float f_clamp_kqv;
|
||||
float f_max_alibi_bias;
|
||||
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
if (this->vocab_only != other.vocab_only) return true;
|
||||
if (this->n_vocab != other.n_vocab) return true;
|
||||
|
@ -1388,6 +1404,7 @@ struct llama_layer {
|
|||
// ff bias
|
||||
struct ggml_tensor * ffn_down_b; // b2
|
||||
struct ggml_tensor * ffn_up_b; // b3
|
||||
struct ggml_tensor * ffn_act;
|
||||
};
|
||||
|
||||
struct llama_kv_cell {
|
||||
|
@ -2548,18 +2565,22 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|||
|
||||
static const char * llama_model_type_name(e_model type) {
|
||||
switch (type) {
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_8B: return "8B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
default: return "?B";
|
||||
case MODEL_1B: return "1B";
|
||||
case MODEL_3B: return "3B";
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_8B: return "8B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_15B: return "15B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_34B: return "34B";
|
||||
case MODEL_40B: return "40B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_70B: return "70B";
|
||||
case MODEL_SMALL: return "0.1B";
|
||||
case MODEL_MEDIUM: return "0.4B";
|
||||
case MODEL_LARGE: return "0.8B";
|
||||
case MODEL_XL: return "1.5B";
|
||||
default: return "?B";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2778,6 +2799,17 @@ static void llm_load_hparams(
|
|||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||
switch (hparams.n_layer) {
|
||||
case 12: model.type = e_model::MODEL_SMALL; break;
|
||||
case 24: model.type = e_model::MODEL_MEDIUM; break;
|
||||
case 36: model.type = e_model::MODEL_LARGE; break;
|
||||
case 48: model.type = e_model::MODEL_XL; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
|
||||
default: (void)0;
|
||||
}
|
||||
|
@ -3471,7 +3503,6 @@ static bool llm_load_tensors(
|
|||
case LLM_ARCH_MPT:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend_type backend_norm;
|
||||
|
@ -3509,6 +3540,9 @@ static bool llm_load_tensors(
|
|||
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
|
||||
// AWQ ScaleActivation layer
|
||||
layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_STABLELM:
|
||||
|
@ -3704,6 +3738,60 @@ static bool llm_load_tensors(
|
|||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend_type backend_norm;
|
||||
ggml_backend_type backend_output;
|
||||
|
||||
if (n_gpu_layers > int(n_layer)) {
|
||||
backend_norm = llama_backend_offload;
|
||||
backend_output = llama_backend_offload_split;
|
||||
} else {
|
||||
backend_norm = GGML_BACKEND_CPU;
|
||||
backend_output = GGML_BACKEND_CPU;
|
||||
}
|
||||
|
||||
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
}
|
||||
|
||||
const uint32_t n_ff = hparams.n_ff;
|
||||
|
||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
||||
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
||||
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
|
||||
|
||||
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
||||
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
|
||||
layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
throw std::runtime_error("unknown architecture");
|
||||
}
|
||||
|
@ -4039,6 +4127,7 @@ static struct ggml_tensor * llm_build_ffn(
|
|||
struct ggml_tensor * gate_b,
|
||||
struct ggml_tensor * down,
|
||||
struct ggml_tensor * down_b,
|
||||
struct ggml_tensor * act_scales,
|
||||
llm_ffn_op_type type_op,
|
||||
llm_ffn_gate_type type_gate,
|
||||
const llm_build_cb & cb,
|
||||
|
@ -4083,6 +4172,10 @@ static struct ggml_tensor * llm_build_ffn(
|
|||
{
|
||||
cur = ggml_gelu(ctx, cur);
|
||||
cb(cur, "ffn_gelu", il);
|
||||
if (act_scales != NULL) {
|
||||
cur = ggml_div(ctx, cur, act_scales);
|
||||
cb(cur, "ffn_act", il);
|
||||
}
|
||||
} break;
|
||||
case LLM_FFN_RELU:
|
||||
{
|
||||
|
@ -4401,6 +4494,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
} else {
|
||||
|
@ -4580,6 +4674,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -4694,6 +4789,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -4798,6 +4894,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5002,6 +5099,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5088,6 +5186,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5183,6 +5282,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5268,11 +5368,11 @@ struct llm_build_context {
|
|||
NULL,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
model.layers[il].ffn_act,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5381,6 +5481,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5493,6 +5594,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5600,6 +5702,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(ffn_output, "ffn_out", il);
|
||||
}
|
||||
|
@ -5703,6 +5806,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5732,6 +5836,102 @@ struct llm_build_context {
|
|||
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_gpt2() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * pos;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
||||
cb(inpL, "inp_embd", -1);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
||||
cb(inp_pos, "inp_pos", -1);
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
|
||||
cb(KQ_mask, "KQ_mask", -1);
|
||||
|
||||
pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos);
|
||||
cb(pos, "pos_embd", -1);
|
||||
|
||||
inpL = ggml_add(ctx0, inpL, pos);
|
||||
cb(inpL, "inpL", -1);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.layers[il].attn_norm,
|
||||
model.layers[il].attn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "attn_norm", il);
|
||||
|
||||
// self-attention
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
|
||||
cb(cur, "wqkv", il);
|
||||
|
||||
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
||||
cb(cur, "bqkv", il);
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
||||
|
||||
cb(Qcur, "Qcur", il);
|
||||
cb(Kcur, "Kcur", il);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
|
||||
llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
|
||||
|
||||
cur = llm_build_kqv(ctx0, model, hparams, kv_self,
|
||||
model.layers[il].wo, model.layers[il].bo,
|
||||
Qcur, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
cb(cur, "kqv_out", il);
|
||||
}
|
||||
|
||||
// add the input
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// FF
|
||||
{
|
||||
cur = llm_build_norm(ctx0, ffn_inp, hparams,
|
||||
model.layers[il].ffn_norm,
|
||||
model.layers[il].ffn_norm_b,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
inpL = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(inpL, "l_out", il);
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, inpL, hparams,
|
||||
model.output_norm,
|
||||
model.output_norm_b,
|
||||
LLM_NORM, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
cur = ggml_mul_mat(ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
|
@ -5887,6 +6087,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|||
{ "ffn_gate", OFFLOAD_FUNC },
|
||||
{ "ffn_gate_b", OFFLOAD_FUNC },
|
||||
{ "ffn_gate_par", OFFLOAD_FUNC },
|
||||
{ "ffn_act", OFFLOAD_FUNC },
|
||||
{ "ffn_down", OFFLOAD_FUNC },
|
||||
{ "ffn_down_b", OFFLOAD_FUNC },
|
||||
{ "ffn_out", OFFLOAD_FUNC },
|
||||
|
@ -6246,6 +6447,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|||
{
|
||||
result = llm.build_plamo();
|
||||
} break;
|
||||
case LLM_ARCH_GPT2:
|
||||
{
|
||||
result = llm.build_gpt2();
|
||||
} break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
|
BIN
models/ggml-vocab-gpt2.gguf
Normal file
BIN
models/ggml-vocab-gpt2.gguf
Normal file
Binary file not shown.
|
@ -41,6 +41,7 @@ llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cp
|
|||
llama_test_executable (test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||
llama_test_executable (test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||
llama_test_executable (test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||
llama_test_executable (test-tokenizer-1-gpt2 test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
|
||||
# llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
|
||||
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue