update: support 4 models
This commit is contained in:
parent
e851199ad3
commit
eb9a790c11
7 changed files with 1257 additions and 92 deletions
|
@ -1,5 +1,15 @@
|
|||
# AWQ: Activation-aware Weight Quantization for LLM - version apply to llamacpp
|
||||
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
|
||||
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
|
||||
|
||||
**Supported models:**
|
||||
|
||||
- [X] LLaMA 🦙
|
||||
- [x] LLaMA 2 🦙🦙
|
||||
- [X] MPT
|
||||
- [X] Mistral AI v0.1
|
||||
- [] Bloom
|
||||
- [] Mixtral MoE
|
||||
|
||||
|
||||
## Contents
|
||||
|
||||
|
@ -22,37 +32,26 @@ git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
|
|||
## Convert
|
||||
Example for llama 7b model
|
||||
```bash
|
||||
python convert-awq-hf-to-gguf.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --tmp-model-path models/llama-7b-scales --outfile models/llama_7b_fp16.gguf
|
||||
# For llama7b and llama27b models
|
||||
python examples/awqutils/convert-awq.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --tmp-model-path models/llama-7b-scales --outfile models/llama_7b_fp16.gguf
|
||||
```
|
||||
|
||||
## Quantize
|
||||
```bash
|
||||
./build/bin/quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
|
||||
./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
|
||||
```
|
||||
|
||||
## Benchmark
|
||||
The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
|
||||
```bash
|
||||
./build/bin/perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
|
||||
./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
|
||||
```
|
||||
|
||||
## Results
|
||||
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
|
||||
We use three types of llamacpp quantization methods to work with our version, including q4, q4_1, and q2_k
|
||||
|
||||
### Llama 7B
|
||||
Build with OpenBLAS
|
||||
|
||||
#### Memory/Disk Requirements
|
||||
|
||||
| Model | Original | AWQ-4bit |
|
||||
|------:|--------------:|--------------:|
|
||||
| fp16 | 12.853 GB | 12.853 GB |
|
||||
| q4_0 | 3.647 GB | 3.647 GB |
|
||||
| q4_1 | 4.041 GB | 4.041 GB |
|
||||
| q2_k | 2.649 GB | 2.649 GB |
|
||||
|
||||
#### Quantization
|
||||
|
||||
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
|
||||
### Llama 7B (Build with OpenBLAS)
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|
||||
|-----------:|--------------|-------:|-------:|-------:|-------:|
|
||||
|
@ -68,21 +67,7 @@ Several quantization methods are supported. They differ in the resulting model d
|
|||
|AWQ-LLama 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|
||||
|
||||
### Llama2 7B
|
||||
Build with CuBLAS
|
||||
|
||||
#### Memory/Disk Requirements
|
||||
|
||||
| Model | Original | AWQ-4bit |
|
||||
|------:|--------------:|--------------:|
|
||||
| fp16 | 12.853 GB | 12.853 GB |
|
||||
| q4_0 | 3.647 GB | 3.647 GB |
|
||||
| q4_1 | 4.041 GB | 4.041 GB |
|
||||
| q2_k | 2.649 GB | 2.649 GB |
|
||||
|
||||
#### Quantization
|
||||
|
||||
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
|
||||
### Llama2 7B (Build with CuBLAS)
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|
||||
|------------:|--------------|-------:|-------:|-------:|-------:|
|
||||
|
@ -98,21 +83,7 @@ Several quantization methods are supported. They differ in the resulting model d
|
|||
|AWQ-LLama2 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|
||||
|
||||
### Mistral 7B v0.1
|
||||
Build with CuBLAS
|
||||
|
||||
#### Memory/Disk Requirements
|
||||
|
||||
| Model | Original | AWQ-4bit |
|
||||
|------:|--------------:|--------------:|
|
||||
| fp16 | 12.853 GB | 12.853 GB |
|
||||
| q4_0 | 3.647 GB | 3.647 GB |
|
||||
| q4_1 | 4.041 GB | 4.041 GB |
|
||||
| q2_k | 2.649 GB | 2.649 GB |
|
||||
|
||||
#### Quantization
|
||||
|
||||
Several quantization methods are supported. They differ in the resulting model disk size and inference speed.
|
||||
### Mistral 7B v0.1 (Build with CuBLAS)
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|
||||
|-------------:|--------------|-------:|-------:|-------:|-------:|
|
||||
|
@ -126,3 +97,18 @@ Several quantization methods are supported. They differ in the resulting model d
|
|||
|AWQ-Mistral 7B| ms/tok @ 4th | xxx| xxx | xxx | xxx |
|
||||
|AWQ-Mistral 7B| ms/tok @ 8th | xxx| xx | xx | xx |
|
||||
|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|
||||
### MPT 7B (Build with OpenBLAS)
|
||||
|
||||
| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|
||||
|-------------:|--------------|-------:|-------:|-------:|-------:|
|
||||
|Mistral 7B | perplexity | xxxxxx | xxxxxx | xxxxxx | xxxxxx |
|
||||
|Mistral 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
|
||||
|Mistral 7B | ms/tok @ 4th | xxx | xx | xx | xx |
|
||||
|Mistral 7B | ms/tok @ 8th | xxx | xx | xx | xx |
|
||||
|Mistral 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
||||
|AWQ-Mistral 7B| perplexity | xxxxxx | xxxxxx | xxxxx | xxxxxx |
|
||||
|AWQ-Mistral 7B| file size | 12.9G | 3.5G | 3.9G | 2.7G |
|
||||
|AWQ-Mistral 7B| ms/tok @ 4th | xxx| xxx | xxx | xxx |
|
||||
|AWQ-Mistral 7B| ms/tok @ 8th | xxx| xx | xx | xx |
|
||||
|AWQ-Mistral 7B| bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
|
|
@ -1,14 +1,32 @@
|
|||
"""
|
||||
Original code from:
|
||||
1. https://github.com/casper-hansen/AutoAWQ
|
||||
2. https://github.com/mit-han-lab/llm-awq
|
||||
"""
|
||||
import os
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import AutoModelForCausalLM, AutoConfig
|
||||
|
||||
from transformers.models.bloom.modeling_bloom import BloomBlock, BloomGelu
|
||||
from transformers.models.opt.modeling_opt import OPTDecoderLayer
|
||||
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm
|
||||
from transformers.models.bloom.modeling_bloom import BloomGelu
|
||||
from transformers.models.llama.modeling_llama import LlamaRMSNorm
|
||||
from transformers.activations import GELUActivation
|
||||
|
||||
|
||||
class ScaledActivation(nn.Module):
|
||||
"""
|
||||
ScaledActivation module wraps an existing activation function and applies a
|
||||
scale factor to its output.
|
||||
|
||||
Args:
|
||||
module (nn.Module): The activation function to be scaled.
|
||||
scales (torch.Tensor): A tensor of size (num_features,) containing the initial
|
||||
scale factors for each feature.
|
||||
|
||||
Returns:
|
||||
torch.Tensor: The scaled output of the activation function.
|
||||
"""
|
||||
|
||||
def __init__(self, module, scales):
|
||||
super().__init__()
|
||||
self.act = module
|
||||
|
@ -17,11 +35,21 @@ class ScaledActivation(nn.Module):
|
|||
def forward(self, x):
|
||||
return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
|
||||
|
||||
|
||||
def set_op_by_name(layer, name, new_module):
|
||||
levels = name.split('.')
|
||||
"""
|
||||
Set the new module for given module's name.
|
||||
|
||||
Args:
|
||||
layer (nn.Module): The layer in which to replace the submodule.
|
||||
name (str): The path to the submodule to be replaced, using dot notation
|
||||
to access nested modules.
|
||||
new_module (nn.Module): The new module to replace the existing one.
|
||||
"""
|
||||
levels = name.split(".")
|
||||
if len(levels) > 1:
|
||||
mod_ = layer
|
||||
for l_idx in range(len(levels)-1):
|
||||
for l_idx in range(len(levels) - 1):
|
||||
if levels[l_idx].isdigit():
|
||||
mod_ = mod_[int(levels[l_idx])]
|
||||
else:
|
||||
|
@ -30,22 +58,45 @@ def set_op_by_name(layer, name, new_module):
|
|||
else:
|
||||
setattr(layer, name, new_module)
|
||||
|
||||
|
||||
def get_op_by_name(module, op_name):
|
||||
# get the op by its name relative to the module
|
||||
"""
|
||||
Retrieves a submodule within a given layer based on its name.
|
||||
|
||||
Args:
|
||||
module (nn.Module): The layer containing the submodule to find.
|
||||
op_name (str): The name of the submodule to search for, using dot notation for nested modules.
|
||||
|
||||
Returns:
|
||||
nn.Module: The requested submodule found within the given layer.
|
||||
|
||||
Raises:
|
||||
ValueError: If the specified submodule cannot be found within the layer.
|
||||
"""
|
||||
for name, m in module.named_modules():
|
||||
if name == op_name:
|
||||
return m
|
||||
raise ValueError(f"Cannot find op {op_name} in module {module}")
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def scale_ln_fcs(ln, fcs, scales):
|
||||
"""
|
||||
Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
|
||||
|
||||
Args:
|
||||
ln (nn.LayerNorm): The LayerNorm module to be scaled.
|
||||
fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
|
||||
scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
|
||||
"""
|
||||
|
||||
if not isinstance(fcs, list):
|
||||
fcs = [fcs]
|
||||
|
||||
scales = scales.to(ln.weight.device)
|
||||
|
||||
ln.weight.div_(scales)
|
||||
if hasattr(ln, 'bias') and ln.bias is not None:
|
||||
if hasattr(ln, "bias") and ln.bias is not None:
|
||||
ln.bias.div_(scales)
|
||||
|
||||
for fc in fcs:
|
||||
|
@ -60,14 +111,20 @@ def scale_ln_fcs(ln, fcs, scales):
|
|||
|
||||
@torch.no_grad()
|
||||
def scale_fc_fc(fc1, fc2, scales):
|
||||
"""
|
||||
Scales the weights of two fully-connected layers in a specific pattern.
|
||||
|
||||
Args:
|
||||
fc1 (nn.Linear): The first fully-connected layer to be scaled.
|
||||
fc2 (nn.Linear): The second fully-connected layer to be scaled.
|
||||
scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
|
||||
"""
|
||||
assert isinstance(fc1, nn.Linear)
|
||||
assert isinstance(fc2, nn.Linear)
|
||||
# assert fc1.out_features == fc2.in_features
|
||||
|
||||
scales = scales.to(fc1.weight.device)
|
||||
|
||||
# fc1.weight.div_(scales.view(-1, 1))
|
||||
fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
|
||||
fc1.weight[-scales.size(0) :].div_(scales.view(-1, 1))
|
||||
if fc1.bias is not None:
|
||||
fc1.bias.div_(scales.view(-1))
|
||||
|
||||
|
@ -81,6 +138,18 @@ def scale_fc_fc(fc1, fc2, scales):
|
|||
|
||||
@torch.no_grad()
|
||||
def scale_gelu_fc(gelu, fc, scales):
|
||||
"""
|
||||
Scales the weight of a GELU activation and a fully-connected layer proportionally.
|
||||
|
||||
Args:
|
||||
gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
|
||||
fc (nn.Linear): The fully-connected layer to be scaled.
|
||||
scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
|
||||
|
||||
Raises:
|
||||
TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
|
||||
TypeError: If the `fc` module is not of type `nn.Linear`.
|
||||
"""
|
||||
assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
|
||||
assert isinstance(fc, nn.Linear)
|
||||
|
||||
|
@ -91,6 +160,20 @@ def scale_gelu_fc(gelu, fc, scales):
|
|||
|
||||
|
||||
def apply_scale(module, scales_list, input_feat_dict=None):
|
||||
"""
|
||||
Applies different scaling strategies to layers based on their type and hierarchy within a given module.
|
||||
|
||||
Args:
|
||||
module (nn.Module): The module containing the layers to be scaled.
|
||||
scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
|
||||
* prev_op_name (str): The name of the preceding operation or module, relative to which the layers to be
|
||||
scaled are located.
|
||||
* layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
|
||||
* scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
|
||||
input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
|
||||
input features (optional). If provided, the input features are also
|
||||
scaled proportionally after scaling the layer weights.
|
||||
"""
|
||||
for prev_op_name, layer_names, scales in scales_list:
|
||||
prev_op = get_op_by_name(module, prev_op_name)
|
||||
layers = [get_op_by_name(module, name) for name in layer_names]
|
||||
|
@ -103,15 +186,17 @@ def apply_scale(module, scales_list, input_feat_dict=None):
|
|||
if isinstance(prev_op, nn.Linear):
|
||||
assert len(layers) == 1
|
||||
scale_fc_fc(prev_op, layers[0], scales)
|
||||
elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)):
|
||||
elif (
|
||||
isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm))
|
||||
or "rmsnorm" in str(prev_op.__class__).lower()
|
||||
):
|
||||
scale_ln_fcs(prev_op, layers, scales)
|
||||
elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
|
||||
new_module = ScaledActivation(prev_op, scales)
|
||||
set_op_by_name(module, prev_op_name, new_module)
|
||||
scale_gelu_fc(prev_op, layers[0], scales)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
f"prev_op {type(prev_op)} not supported yet!")
|
||||
raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
|
||||
|
||||
# apply the scaling to input feat if given; prepare it for clipping
|
||||
if input_feat_dict is not None:
|
||||
|
@ -124,8 +209,18 @@ def apply_scale(module, scales_list, input_feat_dict=None):
|
|||
layer.cpu()
|
||||
scales.cpu()
|
||||
|
||||
|
||||
@torch.no_grad()
|
||||
def apply_clip(module, clip_list):
|
||||
"""
|
||||
Applies element-wise clipping to the weight of a specific layer within a given module.
|
||||
|
||||
Args:
|
||||
module (nn.Module): The module containing the layer to be clipped.
|
||||
clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
|
||||
* name (str): The name of the layer to be clipped, relative to the root of the module.
|
||||
* max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
|
||||
"""
|
||||
for name, max_val in clip_list:
|
||||
layer = get_op_by_name(module, name)
|
||||
layer.cuda()
|
||||
|
@ -136,25 +231,23 @@ def apply_clip(module, clip_list):
|
|||
layer.weight.data = layer.weight.data.reshape(org_shape)
|
||||
layer.cpu()
|
||||
|
||||
def apply_awq(model, awq_results):
|
||||
apply_scale(model, awq_results["scale"])
|
||||
apply_clip(model, awq_results["clip"])
|
||||
|
||||
def add_scale_weights(model, model_path, scale_path, tmp_path):
|
||||
print("Loading pre-computed AWQ results from", str(scale_path))
|
||||
awq_results = torch.load(str(scale_path), map_location="cpu")
|
||||
apply_awq(model, awq_results)
|
||||
model.save_pretrained(str(tmp_path))
|
||||
os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
|
||||
return True
|
||||
def add_scale_weights(model_path, scale_path, tmp_path):
|
||||
"""
|
||||
Adds pre-computed Activation Weight Quantization (AWQ) results to a model, including scaling factors and clipping bounds.
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
model_path = "/data/namtd12/llm_models/Llama-2-7b-hf"
|
||||
scale_path = "awq_cache_pretrained/llama-2-7b-chat-w4-g128.pt"
|
||||
tmp_path = "debug"
|
||||
Args:
|
||||
model_path (str): Path to the pre-trained model to be equipped with AWQ.
|
||||
scale_path (str): Path to the AWQ scale factors (.pt file).
|
||||
tmp_path (str): Path to the temporary directory where the equipped model will be saved.
|
||||
"""
|
||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_path, config=config, trust_remote_code=True)
|
||||
model_path, config=config, trust_remote_code=True
|
||||
)
|
||||
model.eval()
|
||||
add_scale_weights(model, scale_path, tmp_path)
|
||||
awq_results = torch.load(str(scale_path), map_location="cpu")
|
||||
apply_scale(model, awq_results["scale"])
|
||||
apply_clip(model, awq_results["clip"])
|
||||
model.save_pretrained(str(tmp_path))
|
||||
os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
|
||||
|
|
1054
examples/awqutils/convert-awq-hf-to-gguf.py
Executable file
1054
examples/awqutils/convert-awq-hf-to-gguf.py
Executable file
File diff suppressed because it is too large
Load diff
|
@ -25,7 +25,7 @@ from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar
|
|||
|
||||
import numpy as np
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
from awqutils.apply_awq import add_scale_weights
|
||||
from apply_awq import add_scale_weights
|
||||
from transformers import AutoModelForCausalLM, AutoConfig
|
||||
|
||||
|
||||
|
@ -1158,11 +1158,7 @@ def main(args_in: list[str] | None = None) -> None:
|
|||
|
||||
args = parser.parse_args(args_in)
|
||||
if args.awq_path and args.tmp_model_path:
|
||||
config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
args.model, config=config, trust_remote_code=True)
|
||||
model.eval()
|
||||
add_scale_weights(model, args.model, args.awq_path, args.tmp_model_path)
|
||||
add_scale_weights(str(args.model), str(args.awq_path), str(args.tmp_model_path))
|
||||
args.model = args.tmp_model_path
|
||||
|
||||
if args.dump_single:
|
|
@ -114,6 +114,7 @@ class MODEL_TENSOR(IntEnum):
|
|||
FFN_GATE = auto()
|
||||
FFN_DOWN = auto()
|
||||
FFN_UP = auto()
|
||||
FFN_ACT = auto()
|
||||
FFN_NORM = auto()
|
||||
ATTN_Q_NORM = auto()
|
||||
ATTN_K_NORM = auto()
|
||||
|
@ -158,6 +159,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
|||
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
||||
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
||||
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
||||
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
||||
}
|
||||
|
||||
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
||||
|
@ -251,6 +253,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
MODEL_TENSOR.FFN_ACT,
|
||||
],
|
||||
MODEL_ARCH.GPTJ: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
|
|
|
@ -164,6 +164,11 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mlp.w1", # qwen
|
||||
),
|
||||
|
||||
# Awq-activation gate
|
||||
MODEL_TENSOR.FFN_ACT: (
|
||||
"transformer.blocks.{bid}.ffn.act", # mpt
|
||||
),
|
||||
|
||||
# Feed-forward gate
|
||||
MODEL_TENSOR.FFN_GATE: (
|
||||
"model.layers.{bid}.mlp.gate_proj", # llama-hf refact
|
||||
|
|
30
llama.cpp
30
llama.cpp
|
@ -341,6 +341,7 @@ enum llm_tensor {
|
|||
LLM_TENSOR_FFN_GATE,
|
||||
LLM_TENSOR_FFN_DOWN,
|
||||
LLM_TENSOR_FFN_UP,
|
||||
LLM_TENSOR_FFN_ACT,
|
||||
LLM_TENSOR_FFN_NORM,
|
||||
LLM_TENSOR_ATTN_Q_NORM,
|
||||
LLM_TENSOR_ATTN_K_NORM,
|
||||
|
@ -453,6 +454,7 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|||
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act"},
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
|
@ -1271,6 +1273,7 @@ struct llama_layer {
|
|||
// ff bias
|
||||
struct ggml_tensor * ffn_down_b; // b2
|
||||
struct ggml_tensor * ffn_up_b; // b3
|
||||
struct ggml_tensor *ffn_act;
|
||||
};
|
||||
|
||||
struct llama_kv_cell {
|
||||
|
@ -3420,6 +3423,7 @@ static void llm_load_tensors(
|
|||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||
layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
|
@ -3429,6 +3433,7 @@ static void llm_load_tensors(
|
|||
ggml_nbytes(layer.wo) +
|
||||
ggml_nbytes(layer.ffn_norm) +
|
||||
ggml_nbytes(layer.ffn_down) +
|
||||
ggml_nbytes(layer.ffn_act) +
|
||||
ggml_nbytes(layer.ffn_up);
|
||||
}
|
||||
}
|
||||
|
@ -3672,6 +3677,7 @@ enum llm_rope_type {
|
|||
enum llm_ffn_op_type {
|
||||
LLM_FFN_SILU,
|
||||
LLM_FFN_GELU,
|
||||
LLM_FFN_GELU_ACT,
|
||||
LLM_FFN_RELU,
|
||||
LLM_FFN_RELU_SQR,
|
||||
};
|
||||
|
@ -3839,6 +3845,7 @@ static struct ggml_tensor * llm_build_ffn(
|
|||
struct ggml_tensor * gate_b,
|
||||
struct ggml_tensor * down,
|
||||
struct ggml_tensor * down_b,
|
||||
struct ggml_tensor *act_scales,
|
||||
llm_ffn_op_type type_op,
|
||||
llm_ffn_gate_type type_gate,
|
||||
const llm_build_cb & cb,
|
||||
|
@ -3889,6 +3896,16 @@ static struct ggml_tensor * llm_build_ffn(
|
|||
cur = ggml_relu(ctx, cur);
|
||||
cb(cur, "ffn_relu", il);
|
||||
} break;
|
||||
case LLM_FFN_GELU_ACT:
|
||||
{
|
||||
cur = ggml_gelu(ctx, cur);
|
||||
cb(cur, "ffn_relu", il);
|
||||
struct ggml_tensor *repeat = ggml_repeat(ctx, act_scales, cur);
|
||||
cb(repeat, "ffn_repeat(scales)", il);
|
||||
cur = ggml_div(ctx, cur, repeat);
|
||||
cb(cur, "ffn_div(gelu)", il);
|
||||
}
|
||||
break;
|
||||
case LLM_FFN_RELU_SQR:
|
||||
{
|
||||
cur = ggml_relu(ctx, cur);
|
||||
|
@ -4194,6 +4211,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -4314,6 +4332,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -4432,6 +4451,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -4540,6 +4560,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -4748,6 +4769,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_RELU_SQR, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -4838,6 +4860,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -4937,6 +4960,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, model.layers[il].ffn_up_b,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, model.layers[il].ffn_down_b,
|
||||
NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5031,7 +5055,8 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
model.layers[il].ffn_act,
|
||||
LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
|
@ -5143,6 +5168,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5259,6 +5285,7 @@ struct llm_build_context {
|
|||
model.layers[il].ffn_up, NULL,
|
||||
model.layers[il].ffn_gate, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
@ -5441,6 +5468,7 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
|
|||
{ "ffn_gate", OFFLOAD_FUNC },
|
||||
{ "ffn_gate_b", OFFLOAD_FUNC },
|
||||
{ "ffn_gate_par", OFFLOAD_FUNC },
|
||||
{"ffn_act", OFFLOAD_FUNC },
|
||||
{ "ffn_down", OFFLOAD_FUNC },
|
||||
{ "ffn_down_b", OFFLOAD_FUNC },
|
||||
{ "ffn_out", OFFLOAD_FUNC },
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue