Formatted other files
This commit is contained in:
parent
0610672b19
commit
c02f6df7c4
6 changed files with 19 additions and 19 deletions
|
@ -56,7 +56,7 @@ The perplexity measurements in table above are done against the `wikitext2` test
|
|||
|
||||
## Results
|
||||
Results are measured with OpenBLAS (CPU) and cuBLAS (GPU) for a fair comparison.
|
||||
We use three types of llamacpp quantization methods to work with our version, including q4, q4_1, and q2_k
|
||||
We use three llama.cpp quantization methods with our version: q4_0, q4_1, and q2_k.
|
||||
|
||||
### Llama 7B (Build with OpenBLAS)
|
||||
|
||||
|
|
|
@ -150,6 +150,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
}
|
||||
params.seed = std::stoul(argv[i]);
|
||||
} else if (arg == "-awq" || arg == "--use-awq") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.use_awq = true;
|
||||
} else if (arg == "-t" || arg == "--threads") {
|
||||
if (++i >= argc) {
|
||||
|
@ -806,7 +810,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" (can be specified more than once for multiple prompts).\n");
|
||||
printf(" --color colorise output to distinguish prompt and user input from generations\n");
|
||||
printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
|
||||
printf(" -awq SEED, -use-awq Using AWQ quantization model in inferences\n");
|
||||
printf(" -awq, --use-awq Using AWQ quantization model in inferences\n");
|
||||
printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N\n");
|
||||
printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
|
|
|
@ -1029,7 +1029,6 @@ print(f"Loading model: {dir_model.name}")
|
|||
|
||||
hparams = Model.load_hparams(dir_model)
|
||||
|
||||
|
||||
with torch.inference_mode():
|
||||
model_class = Model.from_model_architecture(hparams["architectures"][0])
|
||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
|
||||
|
|
|
@ -164,7 +164,7 @@ class TensorNameMap:
|
|||
"transformer.h.{bid}.mlp.w1", # qwen
|
||||
),
|
||||
|
||||
# Awq-activation gate
|
||||
# AWQ-activation gate
|
||||
MODEL_TENSOR.FFN_ACT: (
|
||||
"transformer.blocks.{bid}.ffn.act", # mpt
|
||||
),
|
||||
|
|
|
@ -3941,16 +3941,13 @@ static struct ggml_tensor * llm_build_ffn_mpt_awq(
|
|||
struct ggml_tensor * cur,
|
||||
struct ggml_tensor * up,
|
||||
struct ggml_tensor * up_b,
|
||||
struct ggml_tensor *gate,
|
||||
struct ggml_tensor *gate_b,
|
||||
struct ggml_tensor * down,
|
||||
struct ggml_tensor * down_b,
|
||||
struct ggml_tensor * act_scales,
|
||||
llm_ffn_op_type type_op,
|
||||
llm_ffn_gate_type type_gate,
|
||||
const llm_build_cb & cb,
|
||||
int il)
|
||||
{
|
||||
int il) {
|
||||
struct ggml_tensor * tmp = ggml_mul_mat(ctx, up, cur);
|
||||
cb(tmp, "ffn_up", il);
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue