update: work for bot mpt and awqmpt
This commit is contained in:
parent
8fece75e35
commit
8177ad4e37
5 changed files with 48 additions and 21 deletions
|
@ -149,6 +149,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.seed = std::stoul(argv[i]);
|
||||
} else if (arg == "-awq" || arg == "--use-awq") {
|
||||
params.use_awq = true;
|
||||
} else if (arg == "-t" || arg == "--threads") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -804,6 +806,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" (can be specified more than once for multiple prompts).\n");
|
||||
printf(" --color colorise output to distinguish prompt and user input from generations\n");
|
||||
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||
printf(" -awq SEED, -use-awq Using AWQ quantization model in inferences\n");
|
||||
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N\n");
|
||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
|
@ -1013,6 +1016,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params &
|
|||
mparams.tensor_split = params.tensor_split;
|
||||
mparams.use_mmap = params.use_mmap;
|
||||
mparams.use_mlock = params.use_mlock;
|
||||
mparams.use_awq = params.use_awq;
|
||||
if (params.kv_overrides.empty()) {
|
||||
mparams.kv_overrides = NULL;
|
||||
} else {
|
||||
|
@ -1096,13 +1100,11 @@ void llama_batch_add(
|
|||
|
||||
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
|
||||
auto mparams = llama_model_params_from_gpt_params(params);
|
||||
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
return std::make_tuple(nullptr, nullptr);
|
||||
}
|
||||
|
||||
auto cparams = llama_context_params_from_gpt_params(params);
|
||||
|
||||
llama_context * lctx = llama_new_context_with_model(model, cparams);
|
||||
|
|
|
@ -125,6 +125,7 @@ struct gpt_params {
|
|||
bool infill = false; // use infill mode
|
||||
bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes
|
||||
bool no_kv_offload = false; // disable KV offloading
|
||||
bool use_awq = false; // use AWQ quantization infer
|
||||
|
||||
std::string cache_type_k = "f16"; // KV cache data type for the K
|
||||
std::string cache_type_v = "f16"; // KV cache data type for the V
|
||||
|
|
|
@ -46,7 +46,7 @@ class Model:
|
|||
self.part_names = self._get_part_names()
|
||||
self.hparams = Model.load_hparams(self.dir_model)
|
||||
self.model_arch = self._get_model_architecture()
|
||||
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess)
|
||||
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False)
|
||||
|
||||
def set_vocab(self):
|
||||
self._set_vocab_gpt2()
|
||||
|
@ -59,7 +59,7 @@ class Model:
|
|||
from safetensors import safe_open
|
||||
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
|
||||
else:
|
||||
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
|
||||
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True))
|
||||
|
||||
with ctx as model_part:
|
||||
for name in model_part.keys():
|
||||
|
@ -444,7 +444,7 @@ class MPTModel(Model):
|
|||
# map tensor names
|
||||
if "scales" in name:
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
|
||||
new_name = new_name + ".scales"
|
||||
new_name = new_name.replace("scales", "act.scales")
|
||||
else:
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
|
@ -1001,6 +1001,7 @@ dir_model = args.model
|
|||
if args.awq_path:
|
||||
from awqpy.apply_awq import add_scale_weights
|
||||
tmp_model_path = args.model / "weighted_model"
|
||||
dir_model = tmp_model_path
|
||||
if tmp_model_path.is_dir():
|
||||
print(f"{tmp_model_path} exists as a weighted model.")
|
||||
else:
|
||||
|
@ -1008,7 +1009,6 @@ if args.awq_path:
|
|||
print("Saving new weighted model ...")
|
||||
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
||||
print(f"Saved weighted model at {tmp_model_path}.")
|
||||
dir_model = tmp_model_path
|
||||
|
||||
if not dir_model.is_dir():
|
||||
print(f'Error: {args.model} is not a directory', file=sys.stderr)
|
||||
|
@ -1029,6 +1029,7 @@ print(f"Loading model: {dir_model.name}")
|
|||
|
||||
hparams = Model.load_hparams(dir_model)
|
||||
|
||||
|
||||
with torch.inference_mode():
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
||||
|
|
52
llama.cpp
52
llama.cpp
|
@ -1178,6 +1178,7 @@ struct llama_hparams {
|
|||
|
||||
float f_clamp_kqv;
|
||||
float f_max_alibi_bias;
|
||||
bool use_awq;
|
||||
|
||||
bool operator!=(const llama_hparams & other) const {
|
||||
if (this->vocab_only != other.vocab_only) return true;
|
||||
|
@ -3379,7 +3380,6 @@ static void llm_load_tensors(
|
|||
case LLM_ARCH_MPT:
|
||||
{
|
||||
model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
|
||||
|
||||
// output
|
||||
{
|
||||
ggml_backend_type backend_norm;
|
||||
|
@ -3423,18 +3423,31 @@ static void llm_load_tensors(
|
|||
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
|
||||
|
||||
layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||
layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
|
||||
if (model.hparams.use_awq) {
|
||||
layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend);
|
||||
}
|
||||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
vram_weights +=
|
||||
if (model.hparams.use_awq) {
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) +
|
||||
ggml_nbytes(layer.wqkv) +
|
||||
ggml_nbytes(layer.wo) +
|
||||
ggml_nbytes(layer.ffn_norm) +
|
||||
ggml_nbytes(layer.ffn_down) +
|
||||
ggml_nbytes(layer.ffn_act) +
|
||||
ggml_nbytes(layer.ffn_act) +
|
||||
ggml_nbytes(layer.ffn_up);
|
||||
}
|
||||
else {
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) +
|
||||
ggml_nbytes(layer.wqkv) +
|
||||
ggml_nbytes(layer.wo) +
|
||||
ggml_nbytes(layer.ffn_norm) +
|
||||
ggml_nbytes(layer.ffn_down) +
|
||||
ggml_nbytes(layer.ffn_up);
|
||||
}
|
||||
}
|
||||
}
|
||||
} break;
|
||||
|
@ -3634,7 +3647,7 @@ static bool llama_model_load(const std::string & fname, llama_model & model, con
|
|||
llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
|
||||
|
||||
model.hparams.vocab_only = params.vocab_only;
|
||||
|
||||
model.hparams.use_awq = params.use_awq;
|
||||
llm_load_arch (ml, model);
|
||||
llm_load_hparams(ml, model);
|
||||
llm_load_vocab (ml, model);
|
||||
|
@ -5119,13 +5132,23 @@ struct llm_build_context {
|
|||
NULL,
|
||||
LLM_NORM, cb, il);
|
||||
cb(cur, "ffn_norm", il);
|
||||
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
model.layers[il].ffn_act,
|
||||
LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
|
||||
if (hparams.use_awq) {
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
model.layers[il].ffn_act,
|
||||
LLM_FFN_GELU_ACT, LLM_FFN_SEQ, cb, il);
|
||||
|
||||
}
|
||||
else {
|
||||
cur = llm_build_ffn(ctx0, cur,
|
||||
model.layers[il].ffn_up, NULL,
|
||||
NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL,
|
||||
LLM_FFN_GELU, LLM_FFN_SEQ, cb, il);
|
||||
|
||||
}
|
||||
cb(cur, "ffn_out", il);
|
||||
}
|
||||
|
||||
|
@ -8841,6 +8864,7 @@ struct llama_model_params llama_model_default_params() {
|
|||
/*.progress_callback_user_data =*/ nullptr,
|
||||
/*.kv_overrides =*/ nullptr,
|
||||
/*.vocab_only =*/ false,
|
||||
/*.use_awq =*/ false,
|
||||
/*.use_mmap =*/ true,
|
||||
/*.use_mlock =*/ false,
|
||||
};
|
||||
|
@ -8936,9 +8960,7 @@ struct llama_model * llama_load_model_from_file(
|
|||
const char * path_model,
|
||||
struct llama_model_params params) {
|
||||
ggml_time_init();
|
||||
|
||||
llama_model * model = new llama_model;
|
||||
|
||||
unsigned cur_percentage = 0;
|
||||
if (params.progress_callback == NULL) {
|
||||
params.progress_callback_user_data = &cur_percentage;
|
||||
|
@ -9065,7 +9087,7 @@ struct llama_context * llama_new_context_with_model(
|
|||
if (params.embedding){
|
||||
ctx->embedding.resize(hparams.n_embd);
|
||||
}
|
||||
|
||||
|
||||
{
|
||||
static const size_t tensor_alignment = 32;
|
||||
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
||||
|
|
1
llama.h
1
llama.h
|
@ -192,6 +192,7 @@ extern "C" {
|
|||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool use_awq; // whether to use awq quantization
|
||||
};
|
||||
|
||||
struct llama_context_params {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue