update: awq support llama-7b model

This commit is contained in:
Trần Đức Nam 2023-12-14 15:41:41 +07:00
parent e18f7345a3
commit 2ea3934ec3
4 changed files with 1470 additions and 0 deletions

67
awqutils/README.md Normal file

@@ -0,0 +1,67 @@
# AWQ: Activation-aware Weight Quantization for LLMs, applied to llama.cpp
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]
## Contents
- [Install](#install)
- [Convert](#convert)
- [Quantize](#quantize)
- [Benchmark](#benchmark)
- [Results](#results)
## Install
Install the requirements:
```bash
pip install -r requirements.txt
```
Download the pre-computed AWQ search results for several model families, including LLaMA, LLaMA 2, MPT, and OPT:
```bash
git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
```
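Each cache entry is an ordinary PyTorch checkpoint. A minimal sketch of inspecting one on the CPU (the file name is the one used in the Convert example below; the `scale` and `clip` keys are the ones consumed by `awqutils/apply_awq.py` from this commit):

```python
import torch

# Load one of the pre-computed AWQ search results on the CPU.
awq_results = torch.load("awq_cache/llama-7b-w4-g128.pt", map_location="cpu")

# Per-layer scaling factors and clipping thresholds used by apply_awq().
print(awq_results.keys())          # expected: dict_keys(['scale', 'clip'])
print(len(awq_results["scale"]))   # one entry per rescaled operator
```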
## Convert
Example for the LLaMA 7B model:
```bash
python convert-awq-hf-to-gguf.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --tmp-model-path models/llama-7b-scales --outfile models/llama_7b_fp16.gguf
```
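Under the hood, the script folds the pre-computed scales and clipping values into the fp16 weights before writing the GGUF file. A rough sketch of that step, using the `add_scale_weights` helper from `awqutils/apply_awq.py` in this commit (assumes `awqutils` is importable from the working directory; the paths mirror the command above):

```python
from transformers import AutoConfig, AutoModelForCausalLM

from awqutils.apply_awq import add_scale_weights

model_path = "models/llama-7b"                # original HF checkpoint
scale_path = "awq_cache/llama-7b-w4-g128.pt"  # pre-computed AWQ search results
tmp_path = "models/llama-7b-scales"           # where the rescaled fp16 model is saved

config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, config=config, trust_remote_code=True)
model.eval()

# Fold the AWQ scales/clipping into the weights and save the result;
# the GGUF conversion then runs on tmp_path instead of the original checkpoint.
add_scale_weights(model, model_path, scale_path, tmp_path)
```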
## Quantize
Quantize the fp16 GGUF model with the llama.cpp `quantize` tool, for example to `q4_0`:
```bash
./build/bin/quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
```
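For intuition only: `q4_0`-style formats store each small block of weights as a single scale plus 4-bit integers, which is where the memory savings come from. A simplified NumPy sketch of quantizing and dequantizing one block (an illustration, not the exact ggml `q4_0` layout):

```python
import numpy as np

def quantize_block_4bit(w: np.ndarray):
    # One scale plus signed 4-bit integers per block (illustration only).
    scale = np.abs(w).max() / 7.0
    q = np.clip(np.round(w / scale), -8, 7).astype(np.int8)
    return scale, q

def dequantize_block_4bit(scale: float, q: np.ndarray) -> np.ndarray:
    return scale * q.astype(np.float32)

block = np.random.randn(32).astype(np.float32)  # ggml q4_0 groups weights in blocks of 32
scale, q = quantize_block_4bit(block)
print(np.abs(block - dequantize_block_4bit(scale, q)).max())  # per-block quantization error
```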
## Benchmark
The perplexity measurements in the Results tables below are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with a context length of 512.
```bash
./build/bin/perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
```
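Roughly speaking, the reported perplexity is the exponential of the average per-token negative log-likelihood over the test set, so lower is better. A minimal sketch of that relationship (the loss value below is only illustrative):

```python
import math

def perplexity(token_nlls: list[float]) -> float:
    # exp of the mean negative log-likelihood over all predicted tokens
    return math.exp(sum(token_nlls) / len(token_nlls))

# An average loss of ~1.776 nats per token corresponds to a perplexity of ~5.9.
print(perplexity([1.776] * 4))
```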
## Results
### Memory/Disk Requirements
LLaMA 7B:

| Format | Original  | AWQ-4bit  |
|-------:|----------:|----------:|
| fp16 | 12.853 GB | 12.853 GB |
| q4_0 | 3.647 GB | 3.647 GB |
| q4_1 | 4.041 GB | 4.041 GB |
| q2_k | 2.649 GB | 2.649 GB |
### Quantization
Several quantization methods are supported. They differ in the resulting model disk size and inference speed; a rough size sanity check follows the table.

| Model        | Measure      | F16    | Q4_0   | Q4_1   | Q2_K   |
|-------------:|--------------|-------:|-------:|-------:|-------:|
| LLaMA 7B     | perplexity   | 5.9066 | 6.1214 | 6.0643 | xxxxxx |
| LLaMA 7B     | file size    | 12.9G  | 3.5G   | 3.9G   | 2.7G   |
| LLaMA 7B     | ms/tok @ 4th | xxx    | xx     | xx     | xx     |
| LLaMA 7B     | ms/tok @ 8th | xxx    | xx     | xx     | xx     |
| LLaMA 7B     | bits/weight  | 16.0   | 4.5    | 5.0    | 2.6    |
| AWQ-LLaMA 7B | perplexity   | 5.9175 | 6.0252 | xxxxxx | xxxxx  |
| AWQ-LLaMA 7B | file size    | 12.9G  | 3.5G   | 3.9G   | 2.7G   |
| AWQ-LLaMA 7B | ms/tok @ 4th | xxx    | xxx    | xxx    | xxx    |
| AWQ-LLaMA 7B | ms/tok @ 8th | xxx    | xx     | xx     | xx     |
| AWQ-LLaMA 7B | bits/weight  | 16.0   | 4.5    | 5.0    | 2.6    |
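As a rough sanity check on these figures, file size is approximately the parameter count times bits per weight, divided by eight. A back-of-the-envelope check for the Q4_0 row, assuming about 6.7B parameters for LLaMA 7B:

```python
params = 6.7e9            # approximate parameter count of LLaMA 7B
bits_per_weight = 4.5     # Q4_0 row in the table above
size_gib = params * bits_per_weight / 8 / 2**30
print(f"{size_gib:.2f} GiB")  # ~3.51 GiB, consistent with the ~3.5G listed for Q4_0
```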

160
awqutils/apply_awq.py Normal file

@@ -0,0 +1,160 @@
import os

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoConfig
from transformers.models.bloom.modeling_bloom import BloomBlock, BloomGelu
from transformers.models.opt.modeling_opt import OPTDecoderLayer
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm
from transformers.activations import GELUActivation


class ScaledActivation(nn.Module):
    # Wraps an activation and divides its output by the per-channel AWQ scales.
    def __init__(self, module, scales):
        super().__init__()
        self.act = module
        self.scales = nn.Parameter(scales.data)

    def forward(self, x):
        return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
def set_op_by_name(layer, name, new_module):
    # Replace a sub-module by its dotted name relative to `layer`.
    levels = name.split(".")
    if len(levels) > 1:
        mod_ = layer
        for l_idx in range(len(levels) - 1):
            if levels[l_idx].isdigit():
                mod_ = mod_[int(levels[l_idx])]
            else:
                mod_ = getattr(mod_, levels[l_idx])
        setattr(mod_, levels[-1], new_module)
    else:
        setattr(layer, name, new_module)


def get_op_by_name(module, op_name):
    # get the op by its name relative to the module
    for name, m in module.named_modules():
        if name == op_name:
            return m
    raise ValueError(f"Cannot find op {op_name} in module {module}")
@torch.no_grad()
def scale_ln_fcs(ln, fcs, scales):
    # Fold the scales into a LayerNorm/RMSNorm and the linear layers it feeds.
    if not isinstance(fcs, list):
        fcs = [fcs]
    scales = scales.to(ln.weight.device)
    ln.weight.div_(scales)
    if hasattr(ln, "bias") and ln.bias is not None:
        ln.bias.div_(scales)
    for fc in fcs:
        fc.weight.mul_(scales.view(1, -1))
    for p in ln.parameters():
        assert torch.isnan(p).sum() == 0
    for fc in fcs:
        for p in fc.parameters():
            assert torch.isnan(p).sum() == 0
@torch.no_grad()
def scale_fc_fc(fc1, fc2, scales):
    # Fold the scales into two consecutive linear layers.
    assert isinstance(fc1, nn.Linear)
    assert isinstance(fc2, nn.Linear)
    # assert fc1.out_features == fc2.in_features
    scales = scales.to(fc1.weight.device)
    # only the last scales.size(0) output channels of fc1 feed fc2
    fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
    if fc1.bias is not None:
        fc1.bias.div_(scales.view(-1))
    fc2.weight.mul_(scales.view(1, -1))
    for p in fc1.parameters():
        assert torch.isnan(p).sum() == 0
    for p in fc2.parameters():
        assert torch.isnan(p).sum() == 0
@torch.no_grad()
def scale_gelu_fc(gelu, fc, scales):
    # Fold the scales into the linear layer that follows a GELU activation.
    assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
    assert isinstance(fc, nn.Linear)
    fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
    for p in fc.parameters():
        assert torch.isnan(p).sum() == 0
def apply_scale(module, scales_list, input_feat_dict=None):
    for prev_op_name, layer_names, scales in scales_list:
        prev_op = get_op_by_name(module, prev_op_name)
        layers = [get_op_by_name(module, name) for name in layer_names]

        prev_op.cuda()
        for layer in layers:
            layer.cuda()
        scales.cuda()

        if isinstance(prev_op, nn.Linear):
            assert len(layers) == 1
            scale_fc_fc(prev_op, layers[0], scales)
        elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)):
            scale_ln_fcs(prev_op, layers, scales)
        elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
            new_module = ScaledActivation(prev_op, scales)
            set_op_by_name(module, prev_op_name, new_module)
            scale_gelu_fc(prev_op, layers[0], scales)
        else:
            raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")

        # apply the scaling to input feat if given; prepare it for clipping
        if input_feat_dict is not None:
            for layer_name in layer_names:
                inp = input_feat_dict[layer_name]
                inp.div_(scales.view(1, -1).to(inp.device))

        prev_op.cpu()
        for layer in layers:
            layer.cpu()
        scales.cpu()
@torch.no_grad()
def apply_clip(module, clip_list):
    # Clamp each layer's weights to the pre-computed per-group maxima.
    for name, max_val in clip_list:
        layer = get_op_by_name(module, name)
        layer.cuda()
        max_val = max_val.to(layer.weight.device)
        org_shape = layer.weight.shape
        layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
        layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
        layer.weight.data = layer.weight.data.reshape(org_shape)
        layer.cpu()
def apply_awq(model, awq_results):
    apply_scale(model, awq_results["scale"])
    apply_clip(model, awq_results["clip"])


def add_scale_weights(model, model_path, scale_path, tmp_path):
    # Fold the pre-computed AWQ results into the model, save it, and copy the tokenizer files.
    print("Loading pre-computed AWQ results from", str(scale_path))
    awq_results = torch.load(str(scale_path), map_location="cpu")
    apply_awq(model, awq_results)
    model.save_pretrained(str(tmp_path))
    os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
    return True
if __name__ == "__main__":
    model_path = "/data/namtd12/llm_models/Llama-2-7b-hf"
    scale_path = "awq_cache_pretrained/llama-2-7b-chat-w4-g128.pt"
    tmp_path = "debug"

    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, config=config, trust_remote_code=True
    )
    model.eval()
    # Pass model_path as well so the tokenizer files are copied next to the scaled model.
    add_scale_weights(model, model_path, scale_path, tmp_path)

2
awqutils/requirements.txt Normal file

@@ -0,0 +1,2 @@
torch>=2.0.0
transformers>=4.32.0

1241
convert-awq-hf-to-gguf.py Executable file

File diff suppressed because it is too large