Formatted other files
This commit is contained in:
parent
0610672b19
commit
c02f6df7c4
6 changed files with 19 additions and 19 deletions
|
@ -56,7 +56,7 @@ The perplexity measurements in table above are done against the `wikitext2` test
|
||||||
|
|
||||||
## Results
|
## Results
|
||||||
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
|
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
|
||||||
We use three types of llamacpp quantization methods to work with our version, including q4, q4_1, and q2_k
|
We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
|
||||||
|
|
||||||
### Llama 7B (Build with OpenBLAS)
|
### Llama 7B (Build with OpenBLAS)
|
||||||
|
|
||||||
|
|
|
@ -150,6 +150,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
||||||
}
|
}
|
||||||
params.seed = std::stoul(argv[i]);
|
params.seed = std::stoul(argv[i]);
|
||||||
} else if (arg == "-awq" || arg == "--use-awq") {
|
} else if (arg == "-awq" || arg == "--use-awq") {
|
||||||
|
if (++i >= argc) {
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
params.use_awq = true;
|
params.use_awq = true;
|
||||||
} else if (arg == "-t" || arg == "--threads") {
|
} else if (arg == "-t" || arg == "--threads") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
|
@ -806,7 +810,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
printf(" (can be specified more than once for multiple prompts).\n");
|
printf(" (can be specified more than once for multiple prompts).\n");
|
||||||
printf(" --color colorise output to distinguish prompt and user input from generations\n");
|
printf(" --color colorise output to distinguish prompt and user input from generations\n");
|
||||||
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||||
printf(" -awq SEED, -use-awq Using AWQ quantization model in inferences\n");
|
printf(" -awq, --use-awq Using AWQ quantization model in inferences\n");
|
||||||
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
||||||
printf(" -tb N, --threads-batch N\n");
|
printf(" -tb N, --threads-batch N\n");
|
||||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||||
|
|
|
@ -1029,7 +1029,6 @@ print(f"Loading model: {dir_model.name}")
|
||||||
|
|
||||||
hparams = Model.load_hparams(dir_model)
|
hparams = Model.load_hparams(dir_model)
|
||||||
|
|
||||||
|
|
||||||
with torch.inference_mode():
|
with torch.inference_mode():
|
||||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
||||||
|
|
|
@ -1163,7 +1163,7 @@ def main(args_in: list[str] | None = None) -> None:
|
||||||
print("Saving new weighted model ...")
|
print("Saving new weighted model ...")
|
||||||
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
|
||||||
print(f"Saved weighted model at {tmp_model_path}.")
|
print(f"Saved weighted model at {tmp_model_path}.")
|
||||||
args.model = tmp_model_path
|
args.model = tmp_model_path
|
||||||
|
|
||||||
if args.dump_single:
|
if args.dump_single:
|
||||||
model_plus = lazy_load_file(args.model)
|
model_plus = lazy_load_file(args.model)
|
||||||
|
|
|
@ -164,7 +164,7 @@ class TensorNameMap:
|
||||||
"transformer.h.{bid}.mlp.w1", # qwen
|
"transformer.h.{bid}.mlp.w1", # qwen
|
||||||
),
|
),
|
||||||
|
|
||||||
# Awq-activation gate
|
# AWQ-activation gate
|
||||||
MODEL_TENSOR.FFN_ACT: (
|
MODEL_TENSOR.FFN_ACT: (
|
||||||
"transformer.blocks.{bid}.ffn.act", # mpt
|
"transformer.blocks.{bid}.ffn.act", # mpt
|
||||||
),
|
),
|
||||||
|
|
25
llama.cpp
25
llama.cpp
|
@ -3937,20 +3937,17 @@ static struct ggml_tensor * llm_build_ffn(
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_tensor * llm_build_ffn_mpt_awq(
|
static struct ggml_tensor * llm_build_ffn_mpt_awq(
|
||||||
struct ggml_context *ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor *cur,
|
struct ggml_tensor * cur,
|
||||||
struct ggml_tensor *up,
|
struct ggml_tensor * up,
|
||||||
struct ggml_tensor *up_b,
|
struct ggml_tensor * up_b,
|
||||||
struct ggml_tensor *gate,
|
struct ggml_tensor * down,
|
||||||
struct ggml_tensor *gate_b,
|
struct ggml_tensor * down_b,
|
||||||
struct ggml_tensor *down,
|
struct ggml_tensor * act_scales,
|
||||||
struct ggml_tensor *down_b,
|
llm_ffn_op_type type_op,
|
||||||
struct ggml_tensor *act_scales,
|
llm_ffn_gate_type type_gate,
|
||||||
llm_ffn_op_type type_op,
|
const llm_build_cb & cb,
|
||||||
llm_ffn_gate_type type_gate,
|
int il) {
|
||||||
const llm_build_cb &cb,
|
|
||||||
int il)
|
|
||||||
{
|
|
||||||
struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
|
struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
|
||||||
cb(tmp, "ffn_up", il);
|
cb(tmp, "ffn_up", il);
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue