Merge branch 'master' into concedo_experimental

# Conflicts:
#   CMakeLists.txt
#   Makefile
commit 0bf75b05dc
10 changed files with 380 additions and 282 deletions
@@ -584,109 +584,109 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     }
 }
 
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    printf("usage: %s [options]\n", argv[0]);
-    fprintf(stdout, "\n");
+    printf("\n");
-    fprintf(stdout, "options:\n");
+    printf("options:\n");
-    fprintf(stdout, "  -h, --help            show this help message and exit\n");
+    printf("  -h, --help            show this help message and exit\n");
-    fprintf(stdout, "  -i, --interactive     run in interactive mode\n");
+    printf("  -i, --interactive     run in interactive mode\n");
-    fprintf(stdout, "  --interactive-first   run in interactive mode and wait for input right away\n");
+    printf("  --interactive-first   run in interactive mode and wait for input right away\n");
-    fprintf(stdout, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
-    fprintf(stdout, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
+    printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(stdout, "  -r PROMPT, --reverse-prompt PROMPT\n");
+    printf("  -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stdout, "                        halt generation at PROMPT, return control in interactive mode\n");
+    printf("                        halt generation at PROMPT, return control in interactive mode\n");
-    fprintf(stdout, "                        (can be specified more than once for multiple prompts).\n");
+    printf("                        (can be specified more than once for multiple prompts).\n");
-    fprintf(stdout, "  --color               colorise output to distinguish prompt and user input from generations\n");
+    printf("  --color               colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stdout, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
+    printf("  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(stdout, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf("  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stdout, "  -p PROMPT, --prompt PROMPT\n");
+    printf("  -p PROMPT, --prompt PROMPT\n");
-    fprintf(stdout, "                        prompt to start generation with (default: empty)\n");
+    printf("                        prompt to start generation with (default: empty)\n");
-    fprintf(stdout, "  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stdout, "  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
+    printf("  --prompt-cache FNAME  file to cache prompt state for faster startup (default: none)\n");
-    fprintf(stdout, "  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
+    printf("  --prompt-cache-all    if specified, saves user input and generations to cache as well.\n");
-    fprintf(stdout, "                        not supported with --interactive or other interactive options\n");
+    printf("                        not supported with --interactive or other interactive options\n");
-    fprintf(stdout, "  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
+    printf("  --prompt-cache-ro     if specified, uses the prompt cache but does not update it.\n");
-    fprintf(stdout, "  --random-prompt       start with a randomized prompt.\n");
+    printf("  --random-prompt       start with a randomized prompt.\n");
-    fprintf(stdout, "  --in-prefix-bos       prefix BOS to user inputs, preceding the `--in-prefix` string\n");
+    printf("  --in-prefix-bos       prefix BOS to user inputs, preceding the `--in-prefix` string\n");
-    fprintf(stdout, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    printf("  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
-    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
+    printf("  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
-    fprintf(stdout, "  -f FNAME, --file FNAME\n");
+    printf("  -f FNAME, --file FNAME\n");
-    fprintf(stdout, "                        prompt file to start generation.\n");
+    printf("                        prompt file to start generation.\n");
-    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
+    printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
+    printf("  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, "  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stdout, "  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(stdout, "  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    printf("  --tfs N               tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(stdout, "  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    printf("  --typical N           locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stdout, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    printf("  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stdout, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stdout, "  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stdout, "  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(stdout, "  --mirostat N          use Mirostat sampling.\n");
+    printf("  --mirostat N          use Mirostat sampling.\n");
-    fprintf(stdout, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(stdout, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stdout, "  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    printf("  --mirostat-lr N       Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stdout, "  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf("  --mirostat-ent N      Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(stdout, "  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    printf("  -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(stdout, "                        modifies the likelihood of token appearing in the completion,\n");
+    printf("                        modifies the likelihood of token appearing in the completion,\n");
-    fprintf(stdout, "                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    printf("                        i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(stdout, "                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    printf("                        or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stdout, "  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
+    printf("  --grammar GRAMMAR     BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
-    fprintf(stdout, "  --grammar-file FNAME  file to read grammar from\n");
+    printf("  --grammar-file FNAME  file to read grammar from\n");
-    fprintf(stdout, "  --cfg-negative-prompt PROMPT\n");
+    printf("  --cfg-negative-prompt PROMPT\n");
-    fprintf(stdout, "                        negative prompt to use for guidance. (default: empty)\n");
+    printf("                        negative prompt to use for guidance. (default: empty)\n");
-    fprintf(stdout, "  --cfg-negative-prompt-file FNAME\n");
+    printf("  --cfg-negative-prompt-file FNAME\n");
-    fprintf(stdout, "                        negative prompt file to use for guidance. (default: empty)\n");
+    printf("                        negative prompt file to use for guidance. (default: empty)\n");
-    fprintf(stdout, "  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    printf("  --cfg-scale N         strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stdout, "  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    printf("  --rope-scale N        RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
-    fprintf(stdout, "  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    printf("  --rope-freq-base N    RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, "  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+    printf("  --rope-freq-scale N   RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
-    fprintf(stdout, "  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    printf("  --ignore-eos          ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(stdout, "  --no-penalize-nl      do not penalize newline token\n");
+    printf("  --no-penalize-nl      do not penalize newline token\n");
-    fprintf(stdout, "  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf("  --memory-f32          use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stdout, "                        not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf("                        not recommended: doubles context memory required and no measurable increase in quality\n");
-    fprintf(stdout, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
+    printf("  --temp N              temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stdout, "  --perplexity          compute perplexity over each ctx window of the prompt\n");
+    printf("  --perplexity          compute perplexity over each ctx window of the prompt\n");
-    fprintf(stdout, "  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
+    printf("  --hellaswag           compute HellaSwag score over random tasks from datafile supplied with -f\n");
-    fprintf(stdout, "  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    printf("  --hellaswag-tasks N   number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
-    fprintf(stdout, "  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    printf("  --keep N              number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    fprintf(stdout, "  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
+    printf("  --draft N             number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
-    fprintf(stdout, "  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+    printf("  --chunks N            max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
-        fprintf(stdout, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
+        printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported()) {
-        fprintf(stdout, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        printf("  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    fprintf(stdout, "  --numa                attempt optimizations that help on some NUMA systems\n");
+    printf("  --numa                attempt optimizations that help on some NUMA systems\n");
-    fprintf(stdout, "                        if run without this previously, it is recommended to drop the system page cache before using this\n");
+    printf("                        if run without this previously, it is recommended to drop the system page cache before using this\n");
-    fprintf(stdout, "                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+    printf("                        see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stdout, "  -ngl N, --n-gpu-layers N\n");
+    printf("  -ngl N, --n-gpu-layers N\n");
-    fprintf(stdout, "                        number of layers to store in VRAM\n");
+    printf("                        number of layers to store in VRAM\n");
-    fprintf(stdout, "  -ts SPLIT --tensor-split SPLIT\n");
+    printf("  -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stdout, "                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, "  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
+    printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, "  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
+    printf("  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
 #ifdef GGML_USE_CUBLAS
-    fprintf(stdout, "  -nommq, --no-mul-mat-q\n");
+    printf("  -nommq, --no-mul-mat-q\n");
-    fprintf(stdout, "                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
-    fprintf(stdout, "                        Not recommended since this is both slower and uses more VRAM.\n");
+    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    fprintf(stdout, "  --mtest               compute maximum memory usage\n");
+    printf("  --mtest               compute maximum memory usage\n");
-    fprintf(stdout, "  --export              export the computation graph to 'llama.ggml'\n");
+    printf("  --export              export the computation graph to 'llama.ggml'\n");
-    fprintf(stdout, "  --verbose-prompt      print prompt before generation\n");
+    printf("  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
-    fprintf(stdout, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
+    printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf("  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stdout, "  -m FNAME, --model FNAME\n");
+    printf("  -m FNAME, --model FNAME\n");
-    fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    printf("                        model path (default: %s)\n", params.model.c_str());
-    fprintf(stdout, "  -md FNAME, --model-draft FNAME\n");
+    printf("  -md FNAME, --model-draft FNAME\n");
-    fprintf(stdout, "                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
+    printf("                        draft model for speculative decoding (default: %s)\n", params.model.c_str());
-    fprintf(stdout, "  -ld LOGDIR, --logdir LOGDIR\n");
+    printf("  -ld LOGDIR, --logdir LOGDIR\n");
-    fprintf(stdout, "                        path under which to save YAML logs (no logging if unset)\n");
+    printf("                        path under which to save YAML logs (no logging if unset)\n");
-    fprintf(stdout, "\n");
+    printf("\n");
 }
 
 std::string gpt_random_prompt(std::mt19937 & rng) {
common/log.h (16 changed lines)

@@ -513,16 +513,16 @@ inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string &
 
 inline void log_print_usage()
 {
-    fprintf(stdout, "log options:\n");
+    printf("log options:\n");
     /* format
-    fprintf(stdout, "  -h, --help               show this help message and exit\n");*/
+    printf("  -h, --help               show this help message and exit\n");*/
     /* spacing
-    fprintf(stdout, "__-param----------------Description\n");*/
+    printf("__-param----------------Description\n");*/
-    fprintf(stdout, "  --log-test               Run simple logging test\n");
+    printf("  --log-test               Run simple logging test\n");
-    fprintf(stdout, "  --log-disable            Disable trace logs\n");
+    printf("  --log-disable            Disable trace logs\n");
-    fprintf(stdout, "  --log-enable             Enable trace logs\n");
+    printf("  --log-enable             Enable trace logs\n");
-    fprintf(stdout, "  --log-file               Specify a log filename (without extension)\n");
+    printf("  --log-file               Specify a log filename (without extension)\n");
-    fprintf(stdout, "                           Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /*  */
+    printf("                           Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /*  */
 }
 
 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)
@@ -5,6 +5,7 @@ import argparse
 import math
 import struct
 import sys
+from enum import IntEnum
 from pathlib import Path
 
 import numpy as np

@@ -34,10 +35,35 @@ GGML_QUANT_SIZES = {
     gguf.GGMLQuantizationType.Q8_K : (256, 4 + QK_K + QK_K // 8),
 }
 
+class GGMLFormat(IntEnum):
+    GGML = 0
+    GGMF = 1
+    GGJT = 2
+
+class GGMLFType(IntEnum):
+    ALL_F32              = 0
+    MOSTLY_F16           = 1
+    MOSTLY_Q4_0          = 2
+    MOSTLY_Q4_1          = 3
+    MOSTLY_Q4_1_SOME_F16 = 4
+    MOSTLY_Q8_0          = 7
+    MOSTLY_Q5_0          = 8
+    MOSTLY_Q5_1          = 9
+    MOSTLY_Q2_K          = 10
+    MOSTLY_Q3_K_S        = 11
+    MOSTLY_Q3_K_M        = 12
+    MOSTLY_Q3_K_L        = 13
+    MOSTLY_Q4_K_S        = 14
+    MOSTLY_Q4_K_M        = 15
+    MOSTLY_Q5_K_S        = 16
+    MOSTLY_Q5_K_M        = 17
+    MOSTLY_Q6_K          = 18
+
 class Hyperparameters:
     def __init__(self):
-        self.n_vocab = self.n_embd = self.n_mult = self.n_head = self.n_layer = self.n_rot = self.ftype = 0
-        self.n_ff = 0
+        self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
+        self.n_layer = self.n_rot = self.n_ff = 0
+        self.ftype = GGMLFType.ALL_F32
 
     def set_n_ff(self, model):
         ff_tensor_idx = model.tensor_map.get(b'layers.0.feed_forward.w1.weight')

@@ -53,16 +79,21 @@ class Hyperparameters:
             self.n_head,
             self.n_layer,
             self.n_rot,
-            self.ftype,
+            ftype,
         ) = struct.unpack('<7I', data[offset:offset + (4 * 7)])
+        try:
+            self.ftype = GGMLFType(ftype)
+        except ValueError:
+            raise ValueError(f'Invalid ftype {ftype}')
         return 4 * 7
 
     def __str__(self):
-        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype}>'
+        return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
 
 class Vocab:
-    def __init__(self):
+    def __init__(self, load_scores = True):
         self.items = []
+        self.load_scores = load_scores
 
     def load(self, data, offset, n_vocab):
         orig_offset = offset

@@ -70,20 +101,24 @@ class Vocab:
             itemlen = struct.unpack('<I', data[offset:offset + 4])[0]
             assert itemlen < 4096, 'Absurd vocab item length'
             offset += 4
-            vocab = bytes(data[offset:offset + itemlen])
+            item_text = bytes(data[offset:offset + itemlen])
             offset += itemlen
-            score = struct.unpack('<f', data[offset:offset + 4])[0]
-            offset += 4
-            self.items.append((vocab, score))
+            if self.load_scores:
+                item_score = struct.unpack('<f', data[offset:offset + 4])[0]
+                offset += 4
+            else:
+                item_score = 0.0
+            self.items.append((item_text, item_score))
         return offset - orig_offset
 
 class Tensor:
-    def __init__(self):
+    def __init__(self, use_padding = True):
         self.name = None
         self.dims: tuple[int, ...] = ()
         self.dtype = None
         self.start_offset = 0
         self.len_bytes = np.int64(0)
+        self.use_padding = use_padding
 
     def load(self, data, offset):
         orig_offset = offset

@@ -99,7 +134,7 @@ class Tensor:
         offset += 4 * n_dims
         self.name = bytes(data[offset:offset + name_len])
         offset += name_len
-        pad = ((offset + 31) & ~31) - offset
+        pad = ((offset + 31) & ~31) - offset if self.use_padding else 0
         offset += pad
         n_elems = np.prod(self.dims)
         n_bytes = np.int64(np.int64(n_elems) * np.int64(tysize)) // np.int64(blksize)
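The one subtle expression in Tensor.load is the padding rule: ((offset + 31) & ~31) rounds the read offset up to the next 32-byte boundary, and the new use_padding flag turns this off for the older layouts that never padded. A quick standalone check of what the expression computes (an illustrative sketch, not part of the commit):

    def pad_to_32(offset: int) -> int:
        # ((offset + 31) & ~31) rounds offset up to the next multiple of 32;
        # subtracting offset leaves the number of padding bytes to skip.
        return ((offset + 31) & ~31) - offset

    assert pad_to_32(64) == 0   # already aligned
    assert pad_to_32(65) == 31  # rounds up to 96
    assert pad_to_32(95) == 1   # rounds up to 96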
@@ -109,7 +144,7 @@ class Tensor:
         # print(n_dims, name_len, dtype, self.dims, self.name, pad)
         return offset - orig_offset
 
-class GGMLV3Model:
+class GGMLModel:
     def __init__(self):
         self.hyperparameters = None
         self.vocab = None

@@ -117,20 +152,52 @@
         self.tensors = []
 
     def validate_header(self, data, offset):
-        if bytes(data[offset:offset + 4]) != b'tjgg' or struct.unpack('<I', data[offset + 4:offset + 8])[0] != 3:
-            raise ValueError('Only GGJTv3 supported')
-        return 8
+        magic = bytes(data[offset:offset + 4])
+        if magic == b'GGUF':
+            raise ValueError('File is already in GGUF format.')
+        if magic == b'lmgg':
+            self.file_format = GGMLFormat.GGML
+            self.format_version = 1
+            return 4
+        version = struct.unpack('<I', data[offset + 4:offset + 8])[0]
+        if magic == b'fmgg':
+            if version != 1:
+                raise ValueError(f'Cannot handle unexpected GGMF file version {version}')
+            self.file_format = GGMLFormat.GGMF
+            self.format_version = version
+            return 8
+        if magic == b'tjgg':
+            if version < 1 or version > 3:
+                raise ValueError(f'Cannot handle unexpected GGJT file version {version}')
+            self.file_format = GGMLFormat.GGJT
+            self.format_version = version
+            return 8
+        raise ValueError(f"Unexpected file magic {magic!r}! This doesn't look like a GGML format file.")
+
+    def validate_conversion(self, ftype):
+        err = ''
+        if (self.file_format < GGMLFormat.GGJT or self.format_version < 2):
+            if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
+                err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
+        elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
+            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+                err = 'Q4 and Q8 quantizations changed in GGJTv3.'
+        if len(err) > 0:
+            raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
 
     def load(self, data, offset):
         offset += self.validate_header(data, offset)
         hp = Hyperparameters()
         offset += hp.load(data, offset)
-        vocab = Vocab()
+        print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
+        self.validate_conversion(hp.ftype)
+        vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
         offset += vocab.load(data, offset, hp.n_vocab)
         tensors: list[Tensor] = []
         tensor_map = {}
         while offset < len(data):
-            tensor = Tensor()
+            tensor = Tensor(use_padding = self.file_format > GGMLFormat.GGMF)
             offset += tensor.load(data, offset)
             tensor_map[tensor.name] = len(tensors)
             tensors.append(tensor)
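Summing up the new validate_header: instead of accepting only GGJTv3, it maps the leading magic bytes to one of three legacy containers (GGML, GGMF, GGJT) plus a version, and rejects GGUF input outright. A minimal standalone sketch of the same mapping (a hypothetical helper mirroring the logic above, not part of the commit):

    import struct

    def sniff_format(data: bytes) -> tuple[str, int]:
        # GGML ('lmgg') has no version field; GGMF ('fmgg') must be v1;
        # GGJT ('tjgg') may be v1..v3. GGUF input needs no conversion.
        magic = data[:4]
        if magic == b'GGUF':
            raise ValueError('File is already in GGUF format.')
        if magic == b'lmgg':
            return ('GGML', 1)
        version = struct.unpack('<I', data[4:8])[0]
        if magic == b'fmgg' and version == 1:
            return ('GGMF', version)
        if magic == b'tjgg' and 1 <= version <= 3:
            return ('GGJT', version)
        raise ValueError(f'Unexpected file magic {magic!r}')

    print(sniff_format(b'tjgg' + struct.pack('<I', 3)))  # ('GGJT', 3)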
@@ -168,7 +235,10 @@ class GGMLToGGUF:
 
     def save(self):
         print('* Preparing to save GGUF file')
-        gguf_writer = gguf.GGUFWriter(self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
+        gguf_writer = gguf.GGUFWriter(
+            self.cfg.output,
+            gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
+            use_temp_file = False )
         self.add_params(gguf_writer)
         self.add_vocab(gguf_writer)
         if self.special_vocab is not None:

@@ -185,7 +255,10 @@ class GGMLToGGUF:
     def add_params(self, gguf_writer):
         hp = self.model.hyperparameters
         cfg = self.cfg
-        desc = cfg.desc if cfg.desc is not None else 'converted from legacy GGJTv3 format'
+        if cfg.desc is not None:
+            desc = cfg.desc
+        else:
+            desc = f'converted from legacy {self.model.file_format.name}v{self.model.format_version} {hp.ftype.name} format'
         try:
             # Filenames aren't necessarily valid UTF8.
             name = cfg.name if cfg.name is not None else cfg.input.name

@@ -195,6 +268,7 @@ class GGMLToGGUF:
         if name is not None:
             gguf_writer.add_name(name)
         gguf_writer.add_description(desc)
+        gguf_writer.add_file_type(int(hp.ftype))
         if self.params_override is not None:
             po = self.params_override
             assert po.n_embd == hp.n_embd, 'Model hyperparams mismatch'

@@ -231,7 +305,8 @@ class GGMLToGGUF:
                 tokens.append(vbytes)
                 scores.append(score)
                 toktypes.append(ttype)
-            assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
+            assert len(tokens) == hp.n_vocab, \
+                f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
             gguf_writer.add_token_list(tokens)
             gguf_writer.add_token_scores(scores)
             if len(toktypes) > 0:

@@ -283,7 +358,11 @@ class GGMLToGGUF:
                 tempdims[1] = tempdims[0]
                 tempdims[0] = temp
             # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
-            gguf_writer.add_tensor(mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], raw_shape = tempdims, raw_dtype = tensor.dtype)
+            gguf_writer.add_tensor(
+                mapped_name,
+                data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
+                raw_shape = tempdims,
+                raw_dtype = tensor.dtype )
 
 def handle_metadata(cfg, hp):
     import convert

@@ -305,32 +384,46 @@ def handle_metadata(cfg, hp):
         params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
     else:
         raise ValueError('Unable to load metadata')
-    vocab = convert.load_vocab(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir, cfg.vocabtype)
+    vocab = convert.load_vocab(
+        cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
+        cfg.vocabtype )
     # FIXME: Respect cfg.vocab_dir?
     svocab = gguf.SpecialVocab(cfg.model_metadata_dir)
     convert.check_vocab_size(params, vocab)
     return (params, vocab, svocab)
 
 def handle_args():
-    parser = argparse.ArgumentParser(description = 'Convert GGMLv3 models to GGUF')
-    parser.add_argument('--input', '-i', type = Path, required = True, help = 'Input GGMLv3 filename')
-    parser.add_argument('--output', '-o', type = Path, required = True, help ='Output GGUF filename')
-    parser.add_argument('--name', help = 'Set model name')
-    parser.add_argument('--desc', help = 'Set model description')
-    parser.add_argument('--gqa', type = int, default = 1, help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
-    parser.add_argument('--eps', default = '5.0e-06', help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
-    parser.add_argument('--context-length', '-c', type=int, default = 2048, help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
-    parser.add_argument('--model-metadata-dir', '-m', type = Path, help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
-    parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
-    parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)", default="spm")
+    parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
+    parser.add_argument('--input', '-i', type = Path, required = True,
+        help = 'Input GGMLv3 filename')
+    parser.add_argument('--output', '-o', type = Path, required = True,
+        help ='Output GGUF filename')
+    parser.add_argument('--name',
+        help = 'Set model name')
+    parser.add_argument('--desc',
+        help = 'Set model description')
+    parser.add_argument('--gqa', type = int, default = 1,
+        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
+    parser.add_argument('--eps', default = '5.0e-06',
+        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
+    parser.add_argument('--context-length', '-c', type=int, default = 2048,
+        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
+    parser.add_argument('--model-metadata-dir', '-m', type = Path,
+        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
+    parser.add_argument("--vocab-dir", type=Path,
+        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
+    parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
+        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
    return parser.parse_args()
 
 def main():
     cfg = handle_args()
     print(f'* Using config: {cfg}')
     print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
+    if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
+        print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
     data = np.memmap(cfg.input, mode = 'r')
-    model = GGMLV3Model()
+    model = GGMLModel()
     print('* Scanning GGML input file')
     offset = model.load(data, 0)
     print(f'* GGML model hyperparameters: {model.hyperparameters}')

@@ -345,7 +438,12 @@ def main():
         print(f'* Special vocab: {special_vocab}')
     else:
         print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
-    converter = GGMLToGGUF(model, data, cfg, params_override = params_override, vocab_override = vocab_override, special_vocab = special_vocab)
+    if model.file_format == GGMLFormat.GGML:
+        print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
+    converter = GGMLToGGUF(model, data, cfg,
+        params_override = params_override,
+        vocab_override = vocab_override,
+        special_vocab = special_vocab )
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')
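Not every legacy file is eligible for conversion: quantization layouts changed at GGJTv2, and the Q4/Q8 layouts changed again at GGJTv3. A small sketch of the eligibility rule that validate_conversion enforces (illustrative only, reusing the enum values from the diff):

    from enum import IntEnum

    class GGMLFormat(IntEnum):
        GGML = 0
        GGMF = 1
        GGJT = 2

    def convertible(fmt: GGMLFormat, version: int, ftype: str) -> bool:
        if fmt < GGMLFormat.GGJT or version < 2:
            # pre-GGJTv2: only unquantized files convert cleanly
            return ftype in ('ALL_F32', 'MOSTLY_F16')
        if fmt == GGMLFormat.GGJT and version == 2:
            # Q4 and Q8 layouts changed again in GGJTv3
            return ftype not in ('MOSTLY_Q4_0', 'MOSTLY_Q4_1',
                                 'MOSTLY_Q4_1_SOME_F16', 'MOSTLY_Q8_0')
        return True

    print(convertible(GGMLFormat.GGJT, 2, 'MOSTLY_Q4_0'))  # False
    print(convertible(GGMLFormat.GGJT, 3, 'MOSTLY_Q4_0'))  # True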
@@ -673,7 +673,7 @@ class LazyUnpickler(pickle.Unpickler):
         assert isinstance(pid[1], LazyStorageKind)
         data_type = pid[1].data_type
         filename_stem = pid[2]
-        filename = self.data_base_path + '/' + filename_stem
+        filename = f'{self.data_base_path}/{filename_stem}'
         info = self.zip_file.getinfo(filename)
 
         def load(offset: int, elm_count: int) -> NDArray:

@@ -689,7 +689,6 @@ class LazyUnpickler(pickle.Unpickler):
 
     @staticmethod
     def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
-                               # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)
@@ -76,7 +76,7 @@ bool gguf_ex_write(const std::string & fname) {
 
     gguf_write_to_file(ctx, fname.c_str(), false);
 
-    fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
+    printf("%s: wrote file '%s;\n", __func__, fname.c_str());
 
     ggml_free(ctx_data);
     gguf_free(ctx);

@@ -93,20 +93,20 @@ bool gguf_ex_read_0(const std::string & fname) {
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
 
-    fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
+    printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
-    fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+    printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
-    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
 
     // kv
     {
         const int n_kv = gguf_get_n_kv(ctx);
 
-        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+        printf("%s: n_kv: %d\n", __func__, n_kv);
 
         for (int i = 0; i < n_kv; ++i) {
             const char * key = gguf_get_key(ctx, i);
 
-            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
         }
     }

@@ -116,10 +116,10 @@ bool gguf_ex_read_0(const std::string & fname) {
 
         const int keyidx = gguf_find_key(ctx, findkey);
         if (keyidx == -1) {
-            fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
+            printf("%s: find key: %s not found.\n", __func__, findkey);
         } else {
            const char * key_value = gguf_get_val_str(ctx, keyidx);
-            fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+            printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
         }
     }

@@ -127,13 +127,13 @@ bool gguf_ex_read_0(const std::string & fname) {
     {
         const int n_tensors = gguf_get_n_tensors(ctx);
 
-        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);
 
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
 
-            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
         }
     }

@@ -153,20 +153,20 @@ bool gguf_ex_read_1(const std::string & fname) {
 
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
 
-    fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
+    printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
-    fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+    printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
-    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
 
     // kv
     {
         const int n_kv = gguf_get_n_kv(ctx);
 
-        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+        printf("%s: n_kv: %d\n", __func__, n_kv);
 
         for (int i = 0; i < n_kv; ++i) {
             const char * key = gguf_get_key(ctx, i);
 
-            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
         }
     }

@@ -174,13 +174,13 @@ bool gguf_ex_read_1(const std::string & fname) {
     {
         const int n_tensors = gguf_get_n_tensors(ctx);
 
-        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);
 
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
 
-            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
         }
     }

@@ -189,13 +189,13 @@ bool gguf_ex_read_1(const std::string & fname) {
     const int n_tensors = gguf_get_n_tensors(ctx);
 
     for (int i = 0; i < n_tensors; ++i) {
-        fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
+        printf("%s: reading tensor %d data\n", __func__, i);
 
         const char * name = gguf_get_tensor_name(ctx, i);
 
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
 
-        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+        printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
 
         // print first 10 elements
         const float * data = (const float *) cur->data;

@@ -219,7 +219,7 @@ bool gguf_ex_read_1(const std::string & fname) {
         }
     }
 
-    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+    printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
 
     ggml_free(ctx_data);
     gguf_free(ctx);

@@ -229,7 +229,7 @@ bool gguf_ex_read_1(const std::string & fname) {
 
 int main(int argc, char ** argv) {
     if (argc < 3) {
-        fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
+        printf("usage: %s data.gguf r|w\n", argv[0]);
         return -1;
     }
@ -305,9 +305,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
|
||||||
|
|
||||||
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
|
struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
|
||||||
if( cur == NULL ) {
|
if( cur == NULL ) {
|
||||||
fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
|
printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
|
||||||
} else {
|
} else {
|
||||||
// fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
|
// printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
|
||||||
}
|
}
|
||||||
|
|
||||||
return cur;
|
return cur;
|
||||||
|
@ -333,21 +333,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
|
printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
|
||||||
fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
|
printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
|
||||||
fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
|
printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
|
||||||
|
|
||||||
// print all kv
|
// print all kv
|
||||||
#if 0
|
#if 0
|
||||||
{
|
{
|
||||||
const int n_kv = gguf_get_n_kv(ggufctx);
|
const int n_kv = gguf_get_n_kv(ggufctx);
|
||||||
|
|
||||||
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
|
printf("%s: n_kv: %d\n", __func__, n_kv);
|
||||||
|
|
||||||
for (int i = 0; i < n_kv; ++i) {
|
for (int i = 0; i < n_kv; ++i) {
|
||||||
const char * key = gguf_get_key(ggufctx, i);
|
const char * key = gguf_get_key(ggufctx, i);
|
||||||
|
|
||||||
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
|
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -357,21 +357,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||||
int keyidx;
|
int keyidx;
|
||||||
|
|
||||||
keyidx = gguf_find_key(ggufctx, "general.name");
|
keyidx = gguf_find_key(ggufctx, "general.name");
|
||||||
if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||||
keyidx = gguf_find_key(ggufctx, "general.description");
|
keyidx = gguf_find_key(ggufctx, "general.description");
|
||||||
if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||||
keyidx = gguf_find_key(ggufctx, "general.author");
|
keyidx = gguf_find_key(ggufctx, "general.author");
|
||||||
if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||||
keyidx = gguf_find_key(ggufctx, "general.license");
|
keyidx = gguf_find_key(ggufctx, "general.license");
|
||||||
if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||||
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
||||||
if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||||
keyidx = gguf_find_key(ggufctx, "general.file_type");
|
keyidx = gguf_find_key(ggufctx, "general.file_type");
|
||||||
if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||||
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
|
keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
|
||||||
if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||||
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
|
keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
|
||||||
if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
|
||||||
}
|
}
|
||||||
|
|
||||||
// check required metadata
|
// check required metadata
|
||||||
|
@ -382,11 +382,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||||
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
keyidx = gguf_find_key(ggufctx, "general.architecture");
|
||||||
if (keyidx != -1) {
|
if (keyidx != -1) {
|
||||||
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) {
|
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) {
|
||||||
fprintf(stdout, "%s: model architecture not supported!\n", __func__);
|
printf("%s: model architecture not supported!\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
|
printf("%s: gguf model architecture not found!\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -394,11 +394,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||||
keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout");
|
keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout");
|
||||||
if (keyidx != -1) {
|
if (keyidx != -1) {
|
||||||
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) {
|
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) {
|
||||||
fprintf(stdout, "%s: model tensor data layout not supported!\n", __func__);
|
printf("%s: model tensor data layout not supported!\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
fprintf(stdout, "%s: gguf model tensor data layout not found!\n", __func__);
|
printf("%s: gguf model tensor data layout not found!\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -455,11 +455,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
|
||||||
|
|
||||||
if (keyidx != -1) {
|
if (keyidx != -1) {
|
||||||
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
|
if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
|
||||||
fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
|
printf("%s: tokenizer model not supported!\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
|
printf("%s: tokenizer model not found!\n", __func__);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -467,22 +467,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");

     if (tokens_keyidx == -1) {
-        fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
+        printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
         return false;
     }

     int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");

     if (merges_keyidx == -1) {
-        fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
+        printf("%s: gpt2 tokenizer merges not found!\n", __func__);
         return false;
     }

     hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
     hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);

-    fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
-    fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+    printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
+    printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);

     for (size_t i = 0; i < hparams.n_vocab; i++) {
         std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
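The vocab/merges loading in this hunk relies on the GGUF array accessors: gguf_find_key returns a key's index or -1 if the key is absent, gguf_get_arr_n returns the number of elements in an array-valued key, and gguf_get_arr_str returns one string element by index. Condensed from the hunk above (the loop-body comment is an assumption about what the elided code does):

    int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
    if (tokens_keyidx == -1) {
        return false; // the tokenizer vocab is required
    }
    hparams.n_vocab = gguf_get_arr_n(ggufctx, tokens_keyidx);
    for (size_t i = 0; i < hparams.n_vocab; i++) {
        std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
        // presumably: record word <-> i in the vocab's two lookup maps
    }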
@@ -523,12 +523,12 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
     keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }

-    if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
-    if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
-    if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
-    if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
-    if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
-    if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
+    if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+    if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+    if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+    if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+    if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+    if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }

     }

@@ -543,13 +543,13 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     {
         const int n_tensors = gguf_get_n_tensors(ggufctx);

-        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);

         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name (ggufctx, i);
             const size_t offset = gguf_get_tensor_offset(ggufctx, i);

-            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
         }
     }
 #endif
@@ -318,9 +318,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)

     struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
     if( cur == NULL ) {
-        fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
+        printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
     } else {
-        // fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
+        // printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
     }

     return cur;
@@ -346,21 +346,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
         return false;
     }

-    fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
-    fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
-    fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
+    printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
+    printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
+    printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));

     // print all kv
 #if 0
     {
         const int n_kv = gguf_get_n_kv(ggufctx);

-        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+        printf("%s: n_kv: %d\n", __func__, n_kv);

         for (int i = 0; i < n_kv; ++i) {
             const char * key = gguf_get_key(ggufctx, i);

-            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
         }
     }
 #endif
@@ -370,21 +370,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
         int keyidx;

         keyidx = gguf_find_key(ggufctx, "general.name");
-        if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
         keyidx = gguf_find_key(ggufctx, "general.description");
-        if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
         keyidx = gguf_find_key(ggufctx, "general.author");
-        if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
         keyidx = gguf_find_key(ggufctx, "general.license");
-        if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
         keyidx = gguf_find_key(ggufctx, "general.architecture");
-        if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
         keyidx = gguf_find_key(ggufctx, "general.file_type");
-        if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
         keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
-        if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
         keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
-        if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+        if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     }

     // check required metadata
@@ -395,11 +395,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     keyidx = gguf_find_key(ggufctx, "general.architecture");
     if (keyidx != -1) {
         if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) {
-            fprintf(stdout, "%s: model architecture not supported!\n", __func__);
+            printf("%s: model architecture not supported!\n", __func__);
             return false;
         }
     } else {
-        fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
+        printf("%s: gguf model architecture not found!\n", __func__);
         return false;
     }

@@ -456,11 +456,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2

     if (keyidx != -1) {
         if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
-            fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
+            printf("%s: tokenizer model not supported!\n", __func__);
             return false;
         }
     } else {
-        fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
+        printf("%s: tokenizer model not found!\n", __func__);
         return false;
     }

@@ -468,22 +468,22 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");

     if (tokens_keyidx == -1) {
-        fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
+        printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
         return false;
     }

     int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");

     if (merges_keyidx == -1) {
-        fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
+        printf("%s: gpt2 tokenizer merges not found!\n", __func__);
         return false;
     }

     hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
     hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);

-    fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
-    fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+    printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
+    printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);

     for (size_t i = 0; i < hparams.n_vocab; i++) {
         std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@@ -524,12 +524,12 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
     keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }

-    if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
-    if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
-    if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
-    if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
-    if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
-    if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
+    if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+    if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+    if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+    if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+    if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+    if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }

     }

@@ -543,13 +543,13 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     {
         const int n_tensors = gguf_get_n_tensors(ggufctx);

-        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);

         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name (ggufctx, i);
             const size_t offset = gguf_get_tensor_offset(ggufctx, i);

-            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
         }
     }
 #endif
@@ -165,26 +165,26 @@ static const cmd_params cmd_params_defaults = {
 };

 static void print_usage(int /* argc */, char ** argv) {
-    fprintf(stdout, "usage: %s [options]\n", argv[0]);
-    fprintf(stdout, "\n");
-    fprintf(stdout, "options:\n");
-    fprintf(stdout, " -h, --help\n");
-    fprintf(stdout, " -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    fprintf(stdout, " -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
-    fprintf(stdout, " -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    fprintf(stdout, " -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
-    fprintf(stdout, " -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    fprintf(stdout, " -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
-    fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
-    fprintf(stdout, " -ts, --tensor_split <ts0/ts1/..> \n");
-    fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
-    fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
-    fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
-    fprintf(stdout, "\n");
-    fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+    printf("usage: %s [options]\n", argv[0]);
+    printf("\n");
+    printf("options:\n");
+    printf(" -h, --help\n");
+    printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
+    printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
+    printf(" -ts, --tensor_split <ts0/ts1/..> \n");
+    printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
+    printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
+    printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf("\n");
+    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");

 }

@@ -118,7 +118,7 @@ static void server_log(const char *level, const char *function, int line,
     }

     const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
-    fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
+    printf("%.*s\n", (int)str.size(), str.data());
     fflush(stdout);
 }

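A note on the unchanged dump call above: with nlohmann::json, dump(-1, ' ', false, json::error_handler_t::replace) produces a compact single-line serialization (an indent of -1 disables pretty-printing) and replaces invalid UTF-8 rather than throwing, which is why the result is safe to print with a bounded "%.*s". A self-contained sketch of the same call, with illustrative values:

    #include <cstdio>
    #include <nlohmann/json.hpp>
    using json = nlohmann::json;

    int main() {
        json log = { {"level", "INFO"}, {"function", "server_log"}, {"line", 118} };
        const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
        printf("%.*s\n", (int)str.size(), str.data()); // compact, one line
        return 0;
    }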
@@ -694,50 +694,50 @@ struct llama_server_context
 static void server_print_usage(const char *argv0, const gpt_params &params,
                                const server_params &sparams)
 {
-    fprintf(stdout, "usage: %s [options]\n", argv0);
-    fprintf(stdout, "\n");
-    fprintf(stdout, "options:\n");
-    fprintf(stdout, " -h, --help show this help message and exit\n");
-    fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf("usage: %s [options]\n", argv0);
+    printf("\n");
+    printf("options:\n");
+    printf(" -h, --help show this help message and exit\n");
+    printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
+    printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported())
     {
-        fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
+        printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported())
     {
-        fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
+    printf(" --numa attempt optimizations that help on some NUMA systems\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
-    fprintf(stdout, " number of layers to store in VRAM\n");
-    fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, " -nommq, --no-mul-mat-q\n");
-    fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
-    fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
+    printf(" -ngl N, --n-gpu-layers N\n");
+    printf(" number of layers to store in VRAM\n");
+    printf(" -ts SPLIT --tensor-split SPLIT\n");
+    printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
+    printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    printf(" -nommq, --no-mul-mat-q\n");
+    printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    printf(" Not recommended since this is both slower and uses more VRAM.\n");
 #endif
-    fprintf(stdout, " -m FNAME, --model FNAME\n");
-    fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
-    fprintf(stdout, " -a ALIAS, --alias ALIAS\n");
-    fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n");
-    fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stdout, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
-    fprintf(stdout, " --port PORT port to listen (default (default: %d)\n", sparams.port);
-    fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
-    fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
-    fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
-    fprintf(stdout, "\n");
+    printf(" -m FNAME, --model FNAME\n");
+    printf(" model path (default: %s)\n", params.model.c_str());
+    printf(" -a ALIAS, --alias ALIAS\n");
+    printf(" set an alias for the model, will be added as `model` field in completion response\n");
+    printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
+    printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
+    printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
+    printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
+    printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
+    printf("\n");
 }

 static void server_params_parse(int argc, char **argv, server_params &sparams,
@@ -1595,7 +1595,7 @@ int main(int argc, char **argv)
     svr.set_base_dir(sparams.public_path);

     // to make it ctrl+clickable:
-    fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
+    printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);

     LOG_INFO("HTTP server listening", {
                 {"hostname", sparams.hostname},
@@ -1089,6 +1089,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
         if (!max_abs_scale) {
             memset(&y[i], 0, sizeof(block_q6_K));
             y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
             continue;
         }

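Unlike the rest of this diff, the k_quants.c hunk is a behavioral fix rather than a logging cleanup: in quantize_row_q6_K_reference, the all-zero early-out hit continue without advancing the input pointer x, so every later super-block in the row was quantized from stale input. The added x += QK_K; keeps the read cursor in step with the output block index. A stripped-down sketch of the loop shape (compute_scales and quantize_block are hypothetical stand-ins for the real scale/quantization math):

    for (int i = 0; i < nb; i++) {               // nb = k / QK_K super-blocks
        float max_abs_scale = compute_scales(x); // hypothetical helper
        if (!max_abs_scale) {
            memset(&y[i], 0, sizeof(block_q6_K));
            y[i].d = ggml_fp32_to_fp16(0.f);
            x += QK_K;                           // the fix: consume this block's input too
            continue;
        }
        quantize_block(x, &y[i]);                // hypothetical helper
        x += QK_K;                               // normal path already advanced x
    }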