Formatted other files

This commit is contained in:
Le Hoang Anh 2023-12-20 11:04:03 +07:00
parent 0610672b19
commit c02f6df7c4
6 changed files with 19 additions and 19 deletions

View file

@@ -56,7 +56,7 @@ The perplexity measurements in table above are done against the `wikitext2` test
## Results
Results are run on OpenBLAS (CPU) and CuBLAS (GPU) for fair comparison
We use three types of llamacpp quantization methods to work with our version, including q4, q4_1, and q2_k
We use three types of llamacpp quantization methods to work with our version, including q4_0, q4_1, and q2_k
### Llama 7B (Build with OpenBLAS)

View file

@@ -150,6 +150,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
params.seed = std::stoul(argv[i]);
} else if (arg == "-awq" || arg == "--use-awq") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.use_awq = true;
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
@@ -806,7 +810,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" (can be specified more than once for multiple prompts).\n");
printf(" --color colorise output to distinguish prompt and user input from generations\n");
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
printf(" -awq SEED, -use-awq Using AWQ quantization model in inferences\n");
printf(" -awq, --use-awq Using AWQ quantization model in inferences\n");
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
printf(" -tb N, --threads-batch N\n");
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");

View file

@@ -1029,7 +1029,6 @@ print(f"Loading model: {dir_model.name}")
hparams = Model.load_hparams(dir_model)
with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)

View file

@@ -164,7 +164,7 @@ class TensorNameMap:
"transformer.h.{bid}.mlp.w1", # qwen
),
# Awq-activation gate
# AWQ-activation gate
MODEL_TENSOR.FFN_ACT: (
"transformer.blocks.{bid}.ffn.act", # mpt
),

View file

@@ -3937,20 +3937,17 @@ static struct ggml_tensor * llm_build_ffn(
}
static struct ggml_tensor * llm_build_ffn_mpt_awq(
struct ggml_context *ctx,
struct ggml_tensor *cur,
struct ggml_tensor *up,
struct ggml_tensor *up_b,
struct ggml_tensor *gate,
struct ggml_tensor *gate_b,
struct ggml_tensor *down,
struct ggml_tensor *down_b,
struct ggml_tensor *act_scales,
struct ggml_context * ctx,
struct ggml_tensor * cur,
struct ggml_tensor * up,
struct ggml_tensor * up_b,
struct ggml_tensor * down,
struct ggml_tensor * down_b,
struct ggml_tensor * act_scales,
llm_ffn_op_type type_op,
llm_ffn_gate_type type_gate,
const llm_build_cb &cb,
int il)
{
const llm_build_cb & cb,
int il) {
struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
cb(tmp, "ffn_up", il);