Formatted other files

This commit is contained in:
Le Hoang Anh 2023-12-20 11:04:03 +07:00
parent 0610672b19
commit c02f6df7c4
6 changed files with 19 additions and 19 deletions

View file

@ -56,7 +56,7 @@ The perplexity measurements in table above are done against the `wikitext2` test
## Results ## Results
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
We use three types of llamacpp quantization methods to work with our version, including q4, q4_1, and q2_k We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
### Llama 7B (Build with OpenBLAS) ### Llama 7B (Build with OpenBLAS)

View file

@ -150,6 +150,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
} }
params.seed = std::stoul(argv[i]); params.seed = std::stoul(argv[i]);
} else if (arg == "-awq" || arg == "--use-awq") { } else if (arg == "-awq" || arg == "--use-awq") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.use_awq = true; params.use_awq = true;
} else if (arg == "-t" || arg == "--threads") { } else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) { if (++i >= argc) {
@ -806,7 +810,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" (can be specified more than once for multiple prompts).\n"); printf(" (can be specified more than once for multiple prompts).\n");
printf(" --color colorise output to distinguish prompt and user input from generations\n"); printf(" --color colorise output to distinguish prompt and user input from generations\n");
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
printf(" -awq SEED, -use-awq Using AWQ quantization model in inferences\n"); printf(" -awq, --use-awq Using AWQ quantization model in inferences\n");
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads); printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N\n"); printf(" -tb N, --threads-batch N\n");
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n"); printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");

View file

@ -1029,7 +1029,6 @@ print(f"Loading model: {dir_model.name}")
hparams = Model.load_hparams(dir_model) hparams = Model.load_hparams(dir_model)
with torch.inference_mode(): with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0]) model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

View file

@ -1163,7 +1163,7 @@ def main(args_in: list[str] | None = None) -> None:
print("Saving new weighted model ...") print("Saving new weighted model ...")
add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path))
print(f"Saved weighted model at {tmp_model_path}.") print(f"Saved weighted model at {tmp_model_path}.")
args.model = tmp_model_path args.model = tmp_model_path
if args.dump_single: if args.dump_single:
model_plus = lazy_load_file(args.model) model_plus = lazy_load_file(args.model)

View file

@ -164,7 +164,7 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.w1", # qwen "transformer.h.{bid}.mlp.w1", # qwen
), ),
# Awq-activation gate # AWQ-activation gate
MODEL_TENSOR.FFN_ACT: ( MODEL_TENSOR.FFN_ACT: (
"transformer.blocks.{bid}.ffn.act", # mpt "transformer.blocks.{bid}.ffn.act", # mpt
), ),

View file

@ -3937,20 +3937,17 @@ static struct ggml_tensor * llm_build_ffn(
} }
static struct ggml_tensor * llm_build_ffn_mpt_awq( static struct ggml_tensor * llm_build_ffn_mpt_awq(
struct ggml_context *ctx, struct ggml_context * ctx,
struct ggml_tensor *cur, struct ggml_tensor * cur,
struct ggml_tensor *up, struct ggml_tensor * up,
struct ggml_tensor *up_b, struct ggml_tensor * up_b,
struct ggml_tensor *gate, struct ggml_tensor * down,
struct ggml_tensor *gate_b, struct ggml_tensor * down_b,
struct ggml_tensor *down, struct ggml_tensor * act_scales,
struct ggml_tensor *down_b, llm_ffn_op_type type_op,
struct ggml_tensor *act_scales, llm_ffn_gate_type type_gate,
llm_ffn_op_type type_op, const llm_build_cb & cb,
llm_ffn_gate_type type_gate, int il) {
const llm_build_cb &cb,
int il)
{
struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur); struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
cb(tmp, "ffn_up", il); cb(tmp, "ffn_up", il);