update: add AWQ support for the llama-7b model
This commit is contained in:
parent
e18f7345a3
commit
2ea3934ec3
4 changed files with 1470 additions and 0 deletions
67 awqutils/README.md Normal file
@@ -0,0 +1,67 @@
# AWQ: Activation-aware Weight Quantization for LLMs, applied to llama.cpp

[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]

## Contents

- [Install](#install)
- [Convert](#convert)
- [Quantize](#quantize)
- [Benchmark](#benchmark)
- [Results](#results)

## Install

Install the requirements:

```bash
pip install -r requirements.txt
```

Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, and OPT:

```bash
git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
```
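Each file in `awq_cache` is a PyTorch checkpoint holding the pre-computed scaling and clipping results that `awqutils/apply_awq.py` later applies to the model. An optional sanity check (the filename below is the same entry used in the Convert example):

```python
import torch

# Optional: inspect one of the downloaded AWQ search results.
awq_results = torch.load("awq_cache/llama-7b-w4-g128.pt", map_location="cpu")

# apply_awq.py expects a dict with "scale" and "clip" entries.
print(list(awq_results.keys()))
```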
## Convert

Example for the LLaMA 7B model:

```bash
python convert-awq-hf-to-gguf.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --tmp-model-path models/llama-7b-scales --outfile models/llama_7b_fp16.gguf
```
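The scale-application step itself lives in `awqutils/apply_awq.py` (added in this commit). A minimal sketch of applying the pre-computed scales to a Hugging Face checkpoint directly, mirroring that file's `__main__` block; paths are examples and `awqutils` is assumed to be importable from the repository root:

```python
from transformers import AutoConfig, AutoModelForCausalLM

from awqutils.apply_awq import add_scale_weights  # added in this commit

model_path = "models/llama-7b"  # example path
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path, config=config, trust_remote_code=True)
model.eval()

# Applies the AWQ scales/clips and saves the scaled model (plus the copied
# tokenizer files) to a temporary model directory for GGUF conversion.
add_scale_weights(model, model_path,
                  "awq_cache/llama-7b-w4-g128.pt",
                  "models/llama-7b-scales")
```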
## Quantize

```bash
./build/bin/quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
```
## Benchmark

The perplexity measurements in the Results tables below are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with a context length of 512.

```bash
./build/bin/perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
```

## Results

### Memory/Disk Requirements

Llama 7B

| Quantization | Original | AWQ-4bit |
|-------------:|--------------:|--------------:|
| fp16 | 12.853 GB | 12.853 GB |
| q4_0 | 3.647 GB | 3.647 GB |
| q4_1 | 4.041 GB | 4.041 GB |
| q2_k | 2.649 GB | 2.649 GB |
### Quantization

Several quantization methods are supported. They differ in the resulting model disk size and inference speed. Note that with the AWQ scales applied, Q4_0 perplexity drops from 6.1214 to 6.0252, moving closer to the F16 baseline.

| Model | Measure | F16 | Q4_0 | Q4_1 | Q2_K |
|-------------:|--------------|-------:|-------:|-------:|-------:|
| Llama 7B | perplexity | 5.9066 | 6.1214 | 6.0643 | xxxxxx |
| Llama 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
| Llama 7B | ms/tok @ 4th | xxx | xx | xx | xx |
| Llama 7B | ms/tok @ 8th | xxx | xx | xx | xx |
| Llama 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
| AWQ-LLaMA 7B | perplexity | 5.9175 | 6.0252 | xxxxxx | xxxxx |
| AWQ-LLaMA 7B | file size | 12.9G | 3.5G | 3.9G | 2.7G |
| AWQ-LLaMA 7B | ms/tok @ 4th | xxx | xxx | xxx | xxx |
| AWQ-LLaMA 7B | ms/tok @ 8th | xxx | xx | xx | xx |
| AWQ-LLaMA 7B | bits/weight | 16.0 | 4.5 | 5.0 | 2.6 |
160 awqutils/apply_awq.py Normal file
@@ -0,0 +1,160 @@
import os

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoConfig

from transformers.models.bloom.modeling_bloom import BloomBlock, BloomGelu
from transformers.models.opt.modeling_opt import OPTDecoderLayer
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm
from transformers.activations import GELUActivation


class ScaledActivation(nn.Module):
    # wraps an activation module and divides its output by per-channel scales
    def __init__(self, module, scales):
        super().__init__()
        self.act = module
        self.scales = nn.Parameter(scales.data)

    def forward(self, x):
        return self.act(x) / self.scales.view(1, 1, -1).to(x.device)


def set_op_by_name(layer, name, new_module):
    # set a submodule by its dotted name relative to `layer`
    levels = name.split('.')
    if len(levels) > 1:
        mod_ = layer
        for l_idx in range(len(levels) - 1):
            if levels[l_idx].isdigit():
                mod_ = mod_[int(levels[l_idx])]
            else:
                mod_ = getattr(mod_, levels[l_idx])
        setattr(mod_, levels[-1], new_module)
    else:
        setattr(layer, name, new_module)


def get_op_by_name(module, op_name):
    # get the op by its name relative to the module
    for name, m in module.named_modules():
        if name == op_name:
            return m
    raise ValueError(f"Cannot find op {op_name} in module {module}")


@torch.no_grad()
def scale_ln_fcs(ln, fcs, scales):
    # fold per-channel scales into a norm layer and the linear layers that follow it
    if not isinstance(fcs, list):
        fcs = [fcs]

    scales = scales.to(ln.weight.device)

    ln.weight.div_(scales)
    if hasattr(ln, 'bias') and ln.bias is not None:
        ln.bias.div_(scales)

    for fc in fcs:
        fc.weight.mul_(scales.view(1, -1))

    for p in ln.parameters():
        assert torch.isnan(p).sum() == 0
    for fc in fcs:
        for p in fc.parameters():
            assert torch.isnan(p).sum() == 0


@torch.no_grad()
def scale_fc_fc(fc1, fc2, scales):
    # fold scales between two consecutive linear layers
    assert isinstance(fc1, nn.Linear)
    assert isinstance(fc2, nn.Linear)
    # assert fc1.out_features == fc2.in_features

    scales = scales.to(fc1.weight.device)

    # fc1.weight.div_(scales.view(-1, 1))
    fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
    if fc1.bias is not None:
        fc1.bias.div_(scales.view(-1))

    fc2.weight.mul_(scales.view(1, -1))

    for p in fc1.parameters():
        assert torch.isnan(p).sum() == 0
    for p in fc2.parameters():
        assert torch.isnan(p).sum() == 0


@torch.no_grad()
def scale_gelu_fc(gelu, fc, scales):
    # scale the linear layer that follows a GELU activation
    assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
    assert isinstance(fc, nn.Linear)

    fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))

    for p in fc.parameters():
        assert torch.isnan(p).sum() == 0


def apply_scale(module, scales_list, input_feat_dict=None):
    # apply the pre-computed AWQ scales, dispatching on the type of the op
    # that precedes each group of scaled layers
    for prev_op_name, layer_names, scales in scales_list:
        prev_op = get_op_by_name(module, prev_op_name)
        layers = [get_op_by_name(module, name) for name in layer_names]

        prev_op.cuda()
        for layer in layers:
            layer.cuda()
        scales.cuda()

        if isinstance(prev_op, nn.Linear):
            assert len(layers) == 1
            scale_fc_fc(prev_op, layers[0], scales)
        elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)):
            scale_ln_fcs(prev_op, layers, scales)
        elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
            new_module = ScaledActivation(prev_op, scales)
            set_op_by_name(module, prev_op_name, new_module)
            scale_gelu_fc(prev_op, layers[0], scales)
        else:
            raise NotImplementedError(
                f"prev_op {type(prev_op)} not supported yet!")

        # apply the scaling to input feat if given; prepare it for clipping
        if input_feat_dict is not None:
            for layer_name in layer_names:
                inp = input_feat_dict[layer_name]
                inp.div_(scales.view(1, -1).to(inp.device))

        prev_op.cpu()
        for layer in layers:
            layer.cpu()
        scales.cpu()


@torch.no_grad()
def apply_clip(module, clip_list):
    # clamp each weight group-wise to its pre-computed max value
    for name, max_val in clip_list:
        layer = get_op_by_name(module, name)
        layer.cuda()
        max_val = max_val.to(layer.weight.device)
        org_shape = layer.weight.shape
        layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
        layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
        layer.weight.data = layer.weight.data.reshape(org_shape)
        layer.cpu()


def apply_awq(model, awq_results):
    apply_scale(model, awq_results["scale"])
    apply_clip(model, awq_results["clip"])


def add_scale_weights(model, model_path, scale_path, tmp_path):
    print("Loading pre-computed AWQ results from", str(scale_path))
    awq_results = torch.load(str(scale_path), map_location="cpu")
    apply_awq(model, awq_results)
    model.save_pretrained(str(tmp_path))
    os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
    return True


if __name__ == "__main__":
    model_path = "/data/namtd12/llm_models/Llama-2-7b-hf"
    scale_path = "awq_cache_pretrained/llama-2-7b-chat-w4-g128.pt"
    tmp_path = "debug"
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, config=config, trust_remote_code=True)
    model.eval()
    # pass model_path too, so add_scale_weights can copy the tokenizer files
    add_scale_weights(model, model_path, scale_path, tmp_path)
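For orientation when reading `apply_scale` and `apply_clip` above: each entry of `awq_results["scale"]` is a `(prev_op_name, layer_names, scales)` tuple, and each entry of `awq_results["clip"]` is a `(layer_name, max_val)` pair. A schematic example with illustrative module names and shapes (assuming a LLaMA-style block with hidden size 4096 and group size 128, not values taken from a real awq_cache file):

```python
import torch

# Schematic structure of the dict that apply_awq() consumes.
awq_results = {
    "scale": [
        # (prev_op_name, [names of layers to scale], per-channel scales)
        ("model.layers.0.input_layernorm",
         ["model.layers.0.self_attn.q_proj",
          "model.layers.0.self_attn.k_proj",
          "model.layers.0.self_attn.v_proj"],
         torch.ones(4096)),
    ],
    "clip": [
        # (layer_name, max_val); apply_clip reshapes the weight to
        # (*max_val.shape[:2], -1), e.g. (4096, 32, 128) for group size 128,
        # and clamps it to [-max_val, max_val].
        ("model.layers.0.self_attn.q_proj", torch.ones(4096, 32, 1)),
    ],
}
```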
2 awqutils/requirements.txt Normal file
@@ -0,0 +1,2 @@
torch>=2.0.0
transformers>=4.32.0
1241 convert-awq-hf-to-gguf.py Executable file
File diff suppressed because it is too large