Merge pull request #1 from ggerganov/master

Merges the latest llama.cpp into the fork.
Marc · 2023-09-05 17:06:24 -05:00 · committed by GitHub
commit 9d1a41a966
14 changed files with 393 additions and 281 deletions


@@ -109,12 +109,11 @@ endif
 ifdef LLAMA_CODE_COVERAGE
-CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
+MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
 endif
 ifdef LLAMA_DISABLE_LOGS
-CFLAGS += -DLOG_DISABLE_LOGS
-CXXFLAGS += -DLOG_DISABLE_LOGS
+MK_CPPFLAGS += -DLOG_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS
 # warnings
@@ -124,7 +123,7 @@ MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-m
 ifeq '' '$(findstring clang++,$(CXX))'
 # g++ only
-CXXFLAGS += -Wno-format-truncation
+MK_CXXFLAGS += -Wno-format-truncation
 endif
 # OS specific
@@ -188,8 +187,8 @@ endif
 # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
 # https://github.com/ggerganov/llama.cpp/issues/2922
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
-CFLAGS += -Xassembler -muse-unaligned-vector-move
-CXXFLAGS += -Xassembler -muse-unaligned-vector-move
+MK_CFLAGS += -Xassembler -muse-unaligned-vector-move
+MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
@@ -226,8 +225,8 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 endif
 else
-CFLAGS += -march=rv64gcv -mabi=lp64d
-CXXFLAGS += -march=rv64gcv -mabi=lp64d
+MK_CFLAGS += -march=rv64gcv -mabi=lp64d
+MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
 endif
 ifndef LLAMA_NO_K_QUANTS
@@ -247,16 +246,6 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 endif # LLAMA_NO_ACCELERATE
-ifdef LLAMA_METAL
-# By default - use GPU acceleration on Mac OS
-ifeq ($(UNAME_S),Darwin)
-CFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
-CXXFLAGS += -DGGML_USE_METAL
-LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
-OBJS += ggml-metal.o
-endif
-endif # LLAMA_METAL
 ifdef LLAMA_MPI
 MK_CPPFLAGS += -DGGML_USE_MPI
 MK_CFLAGS += -Wno-cast-qual
@@ -368,7 +357,7 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif # LLAMA_HIPBLAS
 ifdef LLAMA_METAL
-MK_CPPFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
+MK_CPPFLAGS += -DGGML_USE_METAL
 MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
 OBJS += ggml-metal.o
 endif # LLAMA_METAL
@@ -495,7 +484,7 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
 beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
-speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 ifdef LLAMA_METAL
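
The CFLAGS to MK_CFLAGS renames that run through this Makefile are part of the upstream build refactor: the Makefile keeps its own defaults in MK_*-prefixed variables and folds user-supplied flags in afterwards, so a `make CFLAGS=...` override refines the defaults instead of clobbering required defines. A minimal sketch of the pattern, assuming the merge happens once near the top of the Makefile (the exact merge lines fall outside these hunks and are an assumption):

    # build flags owned by the Makefile live in MK_* variables
    MK_CPPFLAGS = -I. -Icommon
    MK_CFLAGS   = -std=c11   -O3
    MK_CXXFLAGS = -std=c++11 -O3
    # user flags from the command line or environment are appended last
    override CFLAGS   := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CFLAGS)   $(CFLAGS)
    override CXXFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS) $(MK_CXXFLAGS) $(CXXFLAGS)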


@@ -584,109 +584,109 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
-    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    printf("usage: %s [options]\n", argv[0]);
-    fprintf(stdout, "\n");
+    printf("\n");
-    fprintf(stdout, "options:\n");
+    printf("options:\n");
-    fprintf(stdout, " -h, --help show this help message and exit\n");
+    printf(" -h, --help show this help message and exit\n");
-    fprintf(stdout, " -i, --interactive run in interactive mode\n");
+    printf(" -i, --interactive run in interactive mode\n");
-    fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n");
+    printf(" --interactive-first run in interactive mode and wait for input right away\n");
-    fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
+    printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
-    fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
+    printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
-    fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n");
+    printf(" -r PROMPT, --reverse-prompt PROMPT\n");
-    fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n");
+    printf(" halt generation at PROMPT, return control in interactive mode\n");
-    fprintf(stdout, " (can be specified more than once for multiple prompts).\n");
+    printf(" (can be specified more than once for multiple prompts).\n");
-    fprintf(stdout, " --color colorise output to distinguish prompt and user input from generations\n");
+    printf(" --color colorise output to distinguish prompt and user input from generations\n");
-    fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
+    printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
-    fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stdout, " -p PROMPT, --prompt PROMPT\n");
+    printf(" -p PROMPT, --prompt PROMPT\n");
-    fprintf(stdout, " prompt to start generation with (default: empty)\n");
+    printf(" prompt to start generation with (default: empty)\n");
-    fprintf(stdout, " -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
+    printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
-    fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
+    printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
-    fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
+    printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
-    fprintf(stdout, " not supported with --interactive or other interactive options\n");
+    printf(" not supported with --interactive or other interactive options\n");
-    fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
+    printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
-    fprintf(stdout, " --random-prompt start with a randomized prompt.\n");
+    printf(" --random-prompt start with a randomized prompt.\n");
-    fprintf(stdout, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
+    printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
-    fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
+    printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
-    fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
+    printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
-    fprintf(stdout, " -f FNAME, --file FNAME\n");
+    printf(" -f FNAME, --file FNAME\n");
-    fprintf(stdout, " prompt file to start generation.\n");
+    printf(" prompt file to start generation.\n");
-    fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
+    printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
-    fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
+    printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
+    printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
-    fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
+    printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
-    fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
+    printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
-    fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
+    printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
-    fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
+    printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
-    fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
+    printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
-    fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
+    printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
-    fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+    printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
-    fprintf(stdout, " --mirostat N use Mirostat sampling.\n");
+    printf(" --mirostat N use Mirostat sampling.\n");
-    fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
+    printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
-    fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
+    printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
-    fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
+    printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
-    fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
+    printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
-    fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
+    printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
-    fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n");
+    printf(" modifies the likelihood of token appearing in the completion,\n");
-    fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
+    printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
-    fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
+    printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
-    fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
+    printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
-    fprintf(stdout, " --grammar-file FNAME file to read grammar from\n");
+    printf(" --grammar-file FNAME file to read grammar from\n");
-    fprintf(stdout, " --cfg-negative-prompt PROMPT\n");
+    printf(" --cfg-negative-prompt PROMPT\n");
-    fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
+    printf(" negative prompt to use for guidance. (default: empty)\n");
-    fprintf(stdout, " --cfg-negative-prompt-file FNAME\n");
+    printf(" --cfg-negative-prompt-file FNAME\n");
-    fprintf(stdout, " negative prompt file to use for guidance. (default: empty)\n");
+    printf(" negative prompt file to use for guidance. (default: empty)\n");
-    fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
+    printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
-    fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
+    printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
-    fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
+    printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
+    printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
-    fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
+    printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
-    fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
+    printf(" --no-penalize-nl do not penalize newline token\n");
-    fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
-    fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
+    printf(" --temp N temperature (default: %.1f)\n", (double)params.temp);
-    fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
+    printf(" --perplexity compute perplexity over each ctx window of the prompt\n");
-    fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
+    printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
-    fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
+    printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
-    fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
+    printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    fprintf(stdout, " --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
+    printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
-    fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
+    printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
     if (llama_mlock_supported()) {
-        fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
+        printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported()) {
-        fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
+    printf(" --numa attempt optimizations that help on some NUMA systems\n");
-    fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n");
+    printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
-    fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+    printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
+    printf(" -ngl N, --n-gpu-layers N\n");
-    fprintf(stdout, " number of layers to store in VRAM\n");
+    printf(" number of layers to store in VRAM\n");
-    fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
+    printf(" -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
+    printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
 #ifdef GGML_USE_CUBLAS
-    fprintf(stdout, " -nommq, --no-mul-mat-q\n");
+    printf(" -nommq, --no-mul-mat-q\n");
-    fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
-    fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
+    printf(" Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_CUBLAS
 #endif
-    fprintf(stdout, " --mtest compute maximum memory usage\n");
+    printf(" --mtest compute maximum memory usage\n");
-    fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
+    printf(" --export export the computation graph to 'llama.ggml'\n");
-    fprintf(stdout, " --verbose-prompt print prompt before generation\n");
+    printf(" --verbose-prompt print prompt before generation\n");
     fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
-    fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+    printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
-    fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
+    printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
-    fprintf(stdout, " -m FNAME, --model FNAME\n");
+    printf(" -m FNAME, --model FNAME\n");
-    fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
+    printf(" model path (default: %s)\n", params.model.c_str());
-    fprintf(stdout, " -md FNAME, --model-draft FNAME\n");
+    printf(" -md FNAME, --model-draft FNAME\n");
-    fprintf(stdout, " draft model for speculative decoding (default: %s)\n", params.model.c_str());
+    printf(" draft model for speculative decoding (default: %s)\n", params.model.c_str());
-    fprintf(stdout, " -ld LOGDIR, --logdir LOGDIR\n");
+    printf(" -ld LOGDIR, --logdir LOGDIR\n");
-    fprintf(stdout, " path under which to save YAML logs (no logging if unset)\n");
+    printf(" path under which to save YAML logs (no logging if unset)\n");
-    fprintf(stdout, "\n");
+    printf("\n");
 }
 std::string gpt_random_prompt(std::mt19937 & rng) {
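
The fprintf(stdout, ...) to printf(...) sweep above is purely cosmetic: the C standard defines printf(fmt, ...) to behave exactly like fprintf(stdout, fmt, ...), so the output is unchanged, and the one --simple-io line deliberately stays on stderr. A two-line illustration:

    // equivalent by definition (C11 7.21.6.3): printf writes to stdout
    fprintf(stdout, "usage: %s [options]\n", argv[0]);
    printf("usage: %s [options]\n", argv[0]);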
@@ -772,7 +772,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 {
     LOG("warming up the model with an empty run\n");
-    const std::vector<llama_token> tmp = { llama_token_bos(lctx), };
+    const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
     llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
     llama_reset_timings(lctx);
 }
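
The warmup change above seeds the throwaway evaluation with a BOS+EOS pair instead of BOS alone, presumably so the dry run also touches the EOS path before timings are reset. Restated as a standalone sketch against this revision's API (lctx and params as in llama_init_from_gpt_params):

    // warm up the model with a short dry run, then discard its cost
    // from the reported timing statistics
    const std::vector<llama_token> tmp = { llama_token_bos(lctx), llama_token_eos(lctx), };
    llama_eval(lctx, tmp.data(), tmp.size(), 0, params.n_threads);
    llama_reset_timings(lctx);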


@@ -513,16 +513,16 @@ inline bool log_param_pair_parse(bool check_but_dont_parse, const std::string &
 inline void log_print_usage()
 {
-    fprintf(stdout, "log options:\n");
+    printf("log options:\n");
     /* format
-    fprintf(stdout, " -h, --help show this help message and exit\n");*/
+    printf(" -h, --help show this help message and exit\n");*/
     /* spacing
-    fprintf(stdout, "__-param----------------Description\n");*/
+    printf("__-param----------------Description\n");*/
-    fprintf(stdout, " --log-test Run simple logging test\n");
+    printf(" --log-test Run simple logging test\n");
-    fprintf(stdout, " --log-disable Disable trace logs\n");
+    printf(" --log-disable Disable trace logs\n");
-    fprintf(stdout, " --log-enable Enable trace logs\n");
+    printf(" --log-enable Enable trace logs\n");
-    fprintf(stdout, " --log-file Specify a log filename (without extension)\n");
+    printf(" --log-file Specify a log filename (without extension)\n");
-    fprintf(stdout, " Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
+    printf(" Log file will be tagged with unique ID and written as \"<name>.<ID>.log\"\n"); /* */
 }
 #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv)


@@ -673,7 +673,7 @@ class LazyUnpickler(pickle.Unpickler):
         assert isinstance(pid[1], LazyStorageKind)
         data_type = pid[1].data_type
         filename_stem = pid[2]
-        filename = self.data_base_path + '/' + filename_stem
+        filename = f'{self.data_base_path}/{filename_stem}'
         info = self.zip_file.getinfo(filename)
         def load(offset: int, elm_count: int) -> NDArray:
@@ -689,7 +689,6 @@ class LazyUnpickler(pickle.Unpickler):
     @staticmethod
     def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,
-                               # pyright: ignore[reportSelfClsParameterName]
                                requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
         assert isinstance(storage, LazyStorage)


@@ -76,7 +76,7 @@ bool gguf_ex_write(const std::string & fname) {
     gguf_write_to_file(ctx, fname.c_str(), false);
-    fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
+    printf("%s: wrote file '%s;\n", __func__, fname.c_str());
     ggml_free(ctx_data);
     gguf_free(ctx);
@@ -93,20 +93,20 @@ bool gguf_ex_read_0(const std::string & fname) {
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-    fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
+    printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
-    fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+    printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
-    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
     // kv
     {
         const int n_kv = gguf_get_n_kv(ctx);
-        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+        printf("%s: n_kv: %d\n", __func__, n_kv);
         for (int i = 0; i < n_kv; ++i) {
             const char * key = gguf_get_key(ctx, i);
-            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
         }
     }
@@ -116,10 +116,10 @@ bool gguf_ex_read_0(const std::string & fname) {
         const int keyidx = gguf_find_key(ctx, findkey);
         if (keyidx == -1) {
-            fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
+            printf("%s: find key: %s not found.\n", __func__, findkey);
         } else {
             const char * key_value = gguf_get_val_str(ctx, keyidx);
-            fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
+            printf("%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
         }
     }
@@ -127,13 +127,13 @@ bool gguf_ex_read_0(const std::string & fname) {
     {
         const int n_tensors = gguf_get_n_tensors(ctx);
-        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
-            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
         }
     }
@@ -153,20 +153,20 @@ bool gguf_ex_read_1(const std::string & fname) {
     struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
-    fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
+    printf("%s: version: %d\n", __func__, gguf_get_version(ctx));
-    fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
+    printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
-    fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
+    printf("%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
     // kv
     {
         const int n_kv = gguf_get_n_kv(ctx);
-        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+        printf("%s: n_kv: %d\n", __func__, n_kv);
         for (int i = 0; i < n_kv; ++i) {
             const char * key = gguf_get_key(ctx, i);
-            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
         }
     }
@@ -174,13 +174,13 @@ bool gguf_ex_read_1(const std::string & fname) {
     {
         const int n_tensors = gguf_get_n_tensors(ctx);
-        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name (ctx, i);
             const size_t offset = gguf_get_tensor_offset(ctx, i);
-            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
         }
     }
@@ -189,13 +189,13 @@ bool gguf_ex_read_1(const std::string & fname) {
     const int n_tensors = gguf_get_n_tensors(ctx);
     for (int i = 0; i < n_tensors; ++i) {
-        fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
+        printf("%s: reading tensor %d data\n", __func__, i);
         const char * name = gguf_get_tensor_name(ctx, i);
         struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
-        fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
+        printf("%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
         // print first 10 elements
         const float * data = (const float *) cur->data;
@@ -219,7 +219,7 @@ bool gguf_ex_read_1(const std::string & fname) {
         }
     }
-    fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
+    printf("%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
     ggml_free(ctx_data);
     gguf_free(ctx);
@@ -229,7 +229,7 @@ bool gguf_ex_read_1(const std::string & fname) {
 int main(int argc, char ** argv) {
     if (argc < 3) {
-        fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
+        printf("usage: %s data.gguf r|w\n", argv[0]);
         return -1;
     }
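
The example above exercises the GGUF inspection API end to end. For reference, a self-contained metadata dump can be built from just the calls that appear in this diff; the sketch below assumes this revision's ggml.h, where gguf_init_params has the fields no_alloc and ctx (no_alloc skips allocating tensor data):

    #include "ggml.h"
    #include <cstdio>

    int main(int argc, char ** argv) {
        if (argc < 2) {
            printf("usage: %s data.gguf\n", argv[0]);
            return 1;
        }
        struct gguf_init_params params = { /*.no_alloc =*/ true, /*.ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
        if (ctx == NULL) {
            printf("failed to read '%s'\n", argv[1]);
            return 1;
        }
        printf("version: %d\n", gguf_get_version(ctx));
        // dump every key-value pair, then every tensor name and offset
        for (int i = 0; i < gguf_get_n_kv(ctx); ++i) {
            printf("kv[%d]: key = %s\n", i, gguf_get_key(ctx, i));
        }
        for (int i = 0; i < gguf_get_n_tensors(ctx); ++i) {
            printf("tensor[%d]: name = %s, offset = %zu\n",
                   i, gguf_get_tensor_name(ctx, i), gguf_get_tensor_offset(ctx, i));
        }
        gguf_free(ctx);
        return 0;
    }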


@@ -305,9 +305,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
     struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
     if( cur == NULL ) {
-        fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
+        printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
     } else {
-        // fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
+        // printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
     }
     return cur;
@@ -333,21 +333,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
         return false;
     }
-    fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
+    printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
-    fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
+    printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
-    fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
+    printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
     // print all kv
 #if 0
     {
         const int n_kv = gguf_get_n_kv(ggufctx);
-        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+        printf("%s: n_kv: %d\n", __func__, n_kv);
         for (int i = 0; i < n_kv; ++i) {
             const char * key = gguf_get_key(ggufctx, i);
-            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
         }
     }
 #endif
@@ -357,21 +357,21 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     int keyidx;
     keyidx = gguf_find_key(ggufctx, "general.name");
-    if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.description");
-    if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.author");
-    if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.license");
-    if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.architecture");
-    if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.file_type");
-    if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
-    if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
-    if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     }
     // check required metadata
@@ -382,11 +382,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     keyidx = gguf_find_key(ggufctx, "general.architecture");
     if (keyidx != -1) {
         if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "falcon") != 0) {
-            fprintf(stdout, "%s: model architecture not supported!\n", __func__);
+            printf("%s: model architecture not supported!\n", __func__);
             return false;
         }
     } else {
-        fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
+        printf("%s: gguf model architecture not found!\n", __func__);
         return false;
     }
@@ -394,11 +394,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     keyidx = gguf_find_key(ggufctx, "falcon.tensor_data_layout");
     if (keyidx != -1) {
         if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "jploski") != 0) {
-            fprintf(stdout, "%s: model tensor data layout not supported!\n", __func__);
+            printf("%s: model tensor data layout not supported!\n", __func__);
             return false;
         }
     } else {
-        fprintf(stdout, "%s: gguf model tensor data layout not found!\n", __func__);
+        printf("%s: gguf model tensor data layout not found!\n", __func__);
         return false;
     }
@@ -455,11 +455,11 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     if (keyidx != -1) {
         if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
-            fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
+            printf("%s: tokenizer model not supported!\n", __func__);
             return false;
         }
     } else {
-        fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
+        printf("%s: tokenizer model not found!\n", __func__);
         return false;
     }
@@ -467,22 +467,22 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
     if (tokens_keyidx == -1) {
-        fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
+        printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
         return false;
     }
     int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
     if (merges_keyidx == -1) {
-        fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
+        printf("%s: gpt2 tokenizer merges not found!\n", __func__);
         return false;
     }
     hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
     hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
-    fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
+    printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
-    fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+    printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
     for (size_t i = 0; i < hparams.n_vocab; i++) {
         std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@@ -523,12 +523,12 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
     keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
-    if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+    if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
-    if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+    if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
-    if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+    if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
-    if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+    if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
-    if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+    if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
-    if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
+    if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
     }
@@ -543,13 +543,13 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt2bpe_
     {
         const int n_tensors = gguf_get_n_tensors(ggufctx);
-        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name (ggufctx, i);
             const size_t offset = gguf_get_tensor_offset(ggufctx, i);
-            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
         }
     }
 #endif


@@ -318,9 +318,9 @@ struct ggml_tensor * get_tensor_ex( struct ggml_context * ctx, std::string name)
     struct ggml_tensor * cur = ggml_get_tensor(ctx, name.c_str());
     if( cur == NULL ) {
-        fprintf(stdout, "%s: tensor '%s' not found!\n", __func__, name.c_str());
+        printf("%s: tensor '%s' not found!\n", __func__, name.c_str());
     } else {
-        // fprintf(stdout, "%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
+        // printf("%s: n_dims = %d, name = '%s'\n", __func__, cur->n_dims, cur->name);
     }
     return cur;
@@ -346,21 +346,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
         return false;
     }
-    fprintf(stdout, "%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
+    printf("%s: gguf version = %d\n", __func__, gguf_get_version(ggufctx));
-    fprintf(stdout, "%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
+    printf("%s: gguf alignment = %zu\n", __func__, gguf_get_alignment(ggufctx));
-    fprintf(stdout, "%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
+    printf("%s: gguf data offset = %zu\n", __func__, gguf_get_data_offset(ggufctx));
     // print all kv
 #if 0
     {
         const int n_kv = gguf_get_n_kv(ggufctx);
-        fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
+        printf("%s: n_kv: %d\n", __func__, n_kv);
         for (int i = 0; i < n_kv; ++i) {
             const char * key = gguf_get_key(ggufctx, i);
-            fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
+            printf("%s: kv[%d]: key = %s\n", __func__, i, key);
         }
     }
 #endif
@@ -370,21 +370,21 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     int keyidx;
     keyidx = gguf_find_key(ggufctx, "general.name");
-    if (keyidx != -1) { fprintf(stdout, "%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model name = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
    keyidx = gguf_find_key(ggufctx, "general.description");
-    if (keyidx != -1) { fprintf(stdout, "%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model description = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.author");
-    if (keyidx != -1) { fprintf(stdout, "%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model author = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.license");
-    if (keyidx != -1) { fprintf(stdout, "%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model license = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.architecture");
-    if (keyidx != -1) { fprintf(stdout, "%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model architecture = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.file_type");
-    if (keyidx != -1) { fprintf(stdout, "%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model file type = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "gptneox.tensor_data_layout");
-    if (keyidx != -1) { fprintf(stdout, "%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model data layout = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     keyidx = gguf_find_key(ggufctx, "general.source.hugginface.repository");
-    if (keyidx != -1) { fprintf(stdout, "%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
+    if (keyidx != -1) { printf("%s: model source HF repo = %s\n", __func__, gguf_get_val_str(ggufctx, keyidx)); }
     }
     // check required metadata
@@ -395,11 +395,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     keyidx = gguf_find_key(ggufctx, "general.architecture");
     if (keyidx != -1) {
         if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gptneox") != 0) {
-            fprintf(stdout, "%s: model architecture not supported!\n", __func__);
+            printf("%s: model architecture not supported!\n", __func__);
             return false;
         }
     } else {
-        fprintf(stdout, "%s: gguf model architecture not found!\n", __func__);
+        printf("%s: gguf model architecture not found!\n", __func__);
         return false;
     }
@@ -456,11 +456,11 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     if (keyidx != -1) {
         if ( strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
-            fprintf(stdout, "%s: tokenizer model not supported!\n", __func__);
+            printf("%s: tokenizer model not supported!\n", __func__);
             return false;
         }
     } else {
-        fprintf(stdout, "%s: tokenizer model not found!\n", __func__);
+        printf("%s: tokenizer model not found!\n", __func__);
         return false;
     }
@@ -468,22 +468,22 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
     if (tokens_keyidx == -1) {
-        fprintf(stdout, "%s: gpt2 tokenizer vocab not found!\n", __func__);
+        printf("%s: gpt2 tokenizer vocab not found!\n", __func__);
         return false;
     }
     int merges_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.merges");
     if (merges_keyidx == -1) {
-        fprintf(stdout, "%s: gpt2 tokenizer merges not found!\n", __func__);
+        printf("%s: gpt2 tokenizer merges not found!\n", __func__);
         return false;
     }
     hparams.n_vocab = gguf_get_arr_n(ggufctx,tokens_keyidx);
     hparams.n_merges = gguf_get_arr_n(ggufctx,merges_keyidx);
-    fprintf(stdout, "%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
+    printf("%s: gpt2 tokenizer vocab = %zu\n", __func__, hparams.n_vocab);
-    fprintf(stdout, "%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
+    printf("%s: gpt2 tokenizer merges = %zu\n", __func__, hparams.n_merges);
     for (size_t i = 0; i < hparams.n_vocab; i++) {
         std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
@@ -524,12 +524,12 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.separator_token_id"); if( keyidx != -1 ) { vocab.special_sep_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
     keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.padding_token_id"); if( keyidx != -1 ) { vocab.special_pad_id = (int32_t)gguf_get_val_u32(ggufctx, keyidx); }
-    if( vocab.special_bos_id != -1 ) { fprintf(stdout, "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
+    if( vocab.special_bos_id != -1 ) { printf("%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].c_str() ); }
-    if( vocab.special_eos_id != -1 ) { fprintf(stdout, "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
+    if( vocab.special_eos_id != -1 ) { printf("%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].c_str() ); }
-    if( vocab.special_unk_id != -1 ) { fprintf(stdout, "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
+    if( vocab.special_unk_id != -1 ) { printf("%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].c_str() ); }
-    if( vocab.special_sep_id != -1 ) { fprintf(stdout, "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
+    if( vocab.special_sep_id != -1 ) { printf("%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].c_str() ); }
-    if( vocab.special_pad_id != -1 ) { fprintf(stdout, "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
+    if( vocab.special_pad_id != -1 ) { printf("%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].c_str() ); }
-    if( vocab.linefeed_id != -1 ) { fprintf(stdout, "%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
+    if( vocab.linefeed_id != -1 ) { printf("%s: LF token = %d\n", __func__, vocab.linefeed_id ); }
     }
@@ -543,13 +543,13 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
     {
         const int n_tensors = gguf_get_n_tensors(ggufctx);
-        fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
+        printf("%s: n_tensors: %d\n", __func__, n_tensors);
         for (int i = 0; i < n_tensors; ++i) {
             const char * name = gguf_get_tensor_name (ggufctx, i);
             const size_t offset = gguf_get_tensor_offset(ggufctx, i);
-            fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
+            printf("%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
         }
     }
 #endif


@@ -165,26 +165,26 @@ static const cmd_params cmd_params_defaults = {
 };
 static void print_usage(int /* argc */, char ** argv) {
-    fprintf(stdout, "usage: %s [options]\n", argv[0]);
+    printf("usage: %s [options]\n", argv[0]);
-    fprintf(stdout, "\n");
+    printf("\n");
-    fprintf(stdout, "options:\n");
+    printf("options:\n");
-    fprintf(stdout, " -h, --help\n");
+    printf(" -h, --help\n");
-    fprintf(stdout, " -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
+    printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
-    fprintf(stdout, " -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
+    printf(" -p, --n-prompt <n> (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str());
-    fprintf(stdout, " -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
+    printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
-    fprintf(stdout, " -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
+    printf(" -b, --batch-size <n> (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str());
-    fprintf(stdout, " --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
+    printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str());
-    fprintf(stdout, " -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
+    printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
-    fprintf(stdout, " -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+    printf(" -ngl N, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-    fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
+    printf(" -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
-    fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
+    printf(" -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
-    fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
+    printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
-    fprintf(stdout, " -ts, --tensor_split <ts0/ts1/..> \n");
+    printf(" -ts, --tensor_split <ts0/ts1/..> \n");
-    fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
+    printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
-    fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
+    printf(" -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
-    fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
+    printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
-    fprintf(stdout, "\n");
+    printf("\n");
-    fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
+    printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
 }


@@ -118,7 +118,7 @@ static void server_log(const char *level, const char *function, int line,
     }
     const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
-    fprintf(stdout, "%.*s\n", (int)str.size(), str.data());
+    printf("%.*s\n", (int)str.size(), str.data());
     fflush(stdout);
 }
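
server_log keeps the %.*s idiom when switching to printf: the * precision consumes an int argument that caps how many bytes are printed, so the call is safe even for character data without a NUL terminator, and the explicit fflush makes each log line visible immediately when stdout is block-buffered (e.g. redirected to a file). A minimal illustration:

    // %.*s prints at most (int)str.size() bytes; no terminator required
    const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
    printf("%.*s\n", (int)str.size(), str.data());
    fflush(stdout);  // flush so the line appears promptly under block buffering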
@@ -694,50 +694,50 @@ struct llama_server_context
 static void server_print_usage(const char *argv0, const gpt_params &params,
                                const server_params &sparams)
 {
-    fprintf(stdout, "usage: %s [options]\n", argv0);
+    printf("usage: %s [options]\n", argv0);
-    fprintf(stdout, "\n");
+    printf("\n");
-    fprintf(stdout, "options:\n");
+    printf("options:\n");
-    fprintf(stdout, " -h, --help show this help message and exit\n");
+    printf(" -h, --help show this help message and exit\n");
-    fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
+    printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
-    fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
+    printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
+    printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
-    fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
+    printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
-    fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
+    printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
-    fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
-    fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
+    printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
-    fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
+    printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
     if (llama_mlock_supported())
     {
-        fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
+        printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
     if (llama_mmap_supported())
     {
-        fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+        printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
-    fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
+    printf(" --numa attempt optimizations that help on some NUMA systems\n");
 #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
+    printf(" -ngl N, --n-gpu-layers N\n");
-    fprintf(stdout, " number of layers to store in VRAM\n");
+    printf(" number of layers to store in VRAM\n");
-    fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
+    printf(" -ts SPLIT --tensor-split SPLIT\n");
-    fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
+    printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
-    fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
+    printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
-    fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
-    fprintf(stdout, " -nommq, --no-mul-mat-q\n");
+    printf(" -nommq, --no-mul-mat-q\n");
-    fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
+    printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif #endif
fprintf(stdout, " -m FNAME, --model FNAME\n"); printf(" -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); printf(" model path (default: %s)\n", params.model.c_str());
fprintf(stdout, " -a ALIAS, --alias ALIAS\n"); printf(" -a ALIAS, --alias ALIAS\n");
fprintf(stdout, " set an alias for the model, will be added as `model` field in completion response\n"); printf(" set an alias for the model, will be added as `model` field in completion response\n");
fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stdout, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); printf(" --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
fprintf(stdout, " --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --port PORT port to listen (default (default: %d)\n", sparams.port);
fprintf(stdout, " --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str());
fprintf(stdout, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout);
fprintf(stdout, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled");
fprintf(stdout, "\n"); printf("\n");
} }
static void server_params_parse(int argc, char **argv, server_params &sparams, static void server_params_parse(int argc, char **argv, server_params &sparams,
@ -1595,7 +1595,7 @@ int main(int argc, char **argv)
svr.set_base_dir(sparams.public_path); svr.set_base_dir(sparams.public_path);
// to make it ctrl+clickable: // to make it ctrl+clickable:
fprintf(stdout, "\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); printf("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
LOG_INFO("HTTP server listening", { LOG_INFO("HTTP server listening", {
{"hostname", sparams.hostname}, {"hostname", sparams.hostname},

View file

@ -6,6 +6,7 @@
#include "common.h" #include "common.h"
#include "llama.h" #include "llama.h"
#include "grammar-parser.h"
#include <cmath> #include <cmath>
#include <cstdio> #include <cstdio>
@ -109,16 +110,35 @@ int main(int argc, char ** argv) {
// used to determine end of generation // used to determine end of generation
bool has_eos = false; bool has_eos = false;
// grammar stuff
struct llama_grammar * grammar_dft = NULL;
struct llama_grammar * grammar_tgt = NULL;
grammar_parser::parse_state parsed_grammar;
// if requested, load the grammar; error checking is omitted for brevity
if (!params.grammar.empty()) {
parsed_grammar = grammar_parser::parse(params.grammar.c_str());
// will be empty (default) if there are parse errors
if (parsed_grammar.rules.empty()) {
return 1;
}
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
}
const auto t_dec_start = ggml_time_us(); const auto t_dec_start = ggml_time_us();
while (true) { while (true) {
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted)); LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
// sample from the drafted tokens if any
int i_dft = 0; int i_dft = 0;
while (true) { while (true) {
const llama_token id = llama_sample_token(ctx_tgt, NULL, NULL, params, last_tokens, candidates, i_dft); // sample from the target model
const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
// remember which tokens were sampled - used for repetition penalties during sampling
last_tokens.erase(last_tokens.begin()); last_tokens.erase(last_tokens.begin());
last_tokens.push_back(id); last_tokens.push_back(id);
@ -134,8 +154,9 @@ int main(int argc, char ** argv) {
++n_predict; ++n_predict;
// check if the draft matches the target
if (i_dft < (int) drafted.size() && id == drafted[i_dft]) { if (i_dft < (int) drafted.size() && id == drafted[i_dft]) {
LOG("drafted token %d accepted\n", id); LOG("the sampled target token matches the %dth drafted token (%d, '%s') - accepted\n", i_dft, id, token_str.c_str());
++n_accept; ++n_accept;
++n_past_tgt; ++n_past_tgt;
++n_past_dft; ++n_past_dft;
@ -145,6 +166,14 @@ int main(int argc, char ** argv) {
} }
// the drafted token was rejected or we are out of drafted tokens // the drafted token was rejected or we are out of drafted tokens
if (i_dft < (int) drafted.size()) {
LOG("the %dth drafted token (%d, '%s') does not match the sampled target token (%d, '%s') - rejected\n",
i_dft, drafted[i_dft], llama_token_to_piece(ctx_dft, drafted[i_dft]).c_str(), id, token_str.c_str());
} else {
LOG("out of drafted tokens\n");
}
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads); llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
++n_past_dft; ++n_past_dft;
@ -158,7 +187,16 @@ int main(int argc, char ** argv) {
break; break;
} }
// sample n_draft tokens from the draft model picking the best token if (grammar_tgt) {
if (grammar_dft) {
llama_grammar_free(grammar_dft);
}
grammar_dft = llama_grammar_copy(grammar_tgt);
LOG("copied target grammar to draft grammar\n");
}
// sample n_draft tokens from the draft model using greedy decoding
int n_past_cur = n_past_dft; int n_past_cur = n_past_dft;
for (int i = 0; i < n_draft; ++i) { for (int i = 0; i < n_draft; ++i) {
float * logits = llama_get_logits(ctx_dft); float * logits = llama_get_logits(ctx_dft);
@ -170,25 +208,40 @@ int main(int argc, char ** argv) {
llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; llama_token_data_array cur_p = { candidates.data(), candidates.size(), false };
if (grammar_dft != NULL) {
llama_sample_grammar(ctx_dft, &cur_p, grammar_dft);
}
// computes softmax and sorts the candidates // computes softmax and sorts the candidates
llama_sample_softmax(ctx_dft, &cur_p); llama_sample_softmax(ctx_dft, &cur_p);
for (int i = 0; i < 3; ++i) { for (int i = 0; i < 3; ++i) {
LOG(" - draft candidate %d: %d (%.3f)\n", i, cur_p.data[i].id, cur_p.data[i].p); LOG(" - draft candidate %3d: %6d (%8.3f) '%s'\n", i, cur_p.data[i].id, cur_p.data[i].p, llama_token_to_piece(ctx_dft, cur_p.data[i].id).c_str());
} }
// too low probability, stop drafting // TODO: better logic?
if (cur_p.data[0].p < 2*cur_p.data[1].p) { if (cur_p.data[0].p < 2*cur_p.data[1].p) {
LOG("stopping drafting, probability too low: %.3f < 2*%.3f\n", cur_p.data[0].p, cur_p.data[1].p);
break; break;
} }
drafted.push_back(cur_p.data[0].id); // drafted token
const llama_token id = cur_p.data[0].id;
drafted.push_back(id);
++n_drafted; ++n_drafted;
if (i < n_draft - 1) { // no need to evaluate the last drafted token, since we won't use the result
// evaluate the drafted token on the draft model if (i == n_draft - 1) {
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads); break;
++n_past_cur; }
// evaluate the drafted token on the draft model
llama_eval(ctx_dft, &drafted.back(), 1, n_past_cur, params.n_threads);
++n_past_cur;
if (grammar_dft != NULL) {
llama_grammar_accept_token(ctx_dft, grammar_dft, id);
} }
} }
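
The drafting-stop condition above (cur_p.data[0].p < 2*cur_p.data[1].p) is a simple confidence gate: drafting continues only while the top draft candidate is at least twice as likely as the runner-up. A self-contained illustration, with probabilities made up for the example:

#include <cstdio>

// returns true when the top candidate dominates the runner-up enough to keep drafting
static bool keep_drafting(float p0, float p1) {
    return p0 >= 2.0f * p1;
}

int main() {
    printf("%d\n", keep_drafting(0.45f, 0.30f)); // 0: 0.45 < 0.60, drafting stops
    printf("%d\n", keep_drafting(0.80f, 0.10f)); // 1: 0.80 >= 0.20, drafting continues
    return 0;
}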
@ -196,6 +249,7 @@ int main(int argc, char ** argv) {
llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads); llama_eval(ctx_tgt, drafted.data(), drafted.size(), n_past_tgt, params.n_threads);
++n_past_tgt; ++n_past_tgt;
// the first token is always proposed by the target model before the speculation loop
drafted.erase(drafted.begin()); drafted.erase(drafted.begin());
} }
@ -226,6 +280,10 @@ int main(int argc, char ** argv) {
llama_free(ctx_dft); llama_free(ctx_dft);
llama_free_model(model_dft); llama_free_model(model_dft);
if (grammar_dft != NULL) {
llama_grammar_free(grammar_dft);
llama_grammar_free(grammar_tgt);
}
llama_backend_free(); llama_backend_free();
fprintf(stderr, "\n\n"); fprintf(stderr, "\n\n");

grammars/json_arr.gbnf Normal file (34 additions)
View file

@ -0,0 +1,34 @@
# This is the same as json.gbnf but we restrict whitespace at the end of the root array
# Useful for generating JSON arrays
root ::= arr
value ::= object | array | string | number | ("true" | "false" | "null") ws
arr ::=
"[\n" ws (
value
(",\n" ws value)*
)? "]"
object ::=
"{" ws (
string ":" ws value
("," ws string ":" ws value)*
)? "}" ws
array ::=
"[" ws (
value
("," ws value)*
)? "]" ws
string ::=
"\"" (
[^"\\] |
"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
)* "\"" ws
number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
# Optional space: by convention, applied in this grammar after literal chars when allowed
ws ::= ([ \t\n] ws)?
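
For reference, a small document that this grammar accepts (note the mandatory newline after the opening bracket of the root array, and that no whitespace may follow the final closing bracket):

[
{"name": "a", "vals": [1, -2.5e3, true]},
"text",
null]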

View file

@ -83,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
float ax = fabsf(x[i]); float ax = fabsf(x[i]);
if (ax > amax) { amax = ax; max = x[i]; } if (ax > amax) { amax = ax; max = x[i]; }
} }
if (!amax) { // all zero if (amax < 1e-30f) { // all values zero or negligibly small
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
L[i] = 0; L[i] = 0;
} }
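
The motivation for replacing the exact-zero test with an epsilon: !amax only catches an exactly-zero maximum, while a denormal maximum slips through and overflows the subsequent -128/max style scale computation (the same failure the q6_K guard in the next hunk protects against). A quick illustration, with a made-up value:

#include <cstdio>

int main() {
    const float amax   = 1e-38f;        // tiny but non-zero, so the old !amax test lets it through
    const float iscale = -128.f / amax; // magnitude ~1.28e40 exceeds FLT_MAX and overflows to -inf
    printf("%f\n", iscale);             // prints "-inf" on typical platforms
    return 0;
}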
@ -1086,6 +1086,12 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
} }
if (!max_abs_scale) {
memset(&y[i], 0, sizeof(block_q6_K));
y[i].d = ggml_fp32_to_fp16(0.f);
continue;
}
float iscale = -128.f/max_scale; float iscale = -128.f/max_scale;
y[i].d = ggml_fp32_to_fp16(1/iscale); y[i].d = ggml_fp32_to_fp16(1/iscale);
for (int ib = 0; ib < QK_K/16; ++ib) { for (int ib = 0; ib < QK_K/16; ++ib) {

View file

@ -2942,7 +2942,12 @@ static bool llama_eval_internal(
// for big prompts, if BLAS is enabled, it is better to use only one thread // for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads; // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
// we still need some threads to process all non-mul_mat ops, but not too many to avoid interfering
// with the BLAS calls. A better solution is needed
if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
n_threads = std::min(4, n_threads);
}
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2]; struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@ -3850,6 +3855,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
delete grammar; delete grammar;
} }
struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
// redirect elements in stacks to point to new rules
for (size_t is = 0; is < result->stacks.size(); is++) {
for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
result->stacks[is][ie] = &result->rules[ir0][ir1];
}
}
}
}
}
return result;
}
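
llama_grammar_copy above is a deep copy with pointer rebasing: the stacks hold pointers into the rules storage, so after copying the containers each pointer that still refers into the source grammar must be redirected to the corresponding element of the copy. A standalone sketch of the same pattern, with hypothetical types (the real grammar uses nested stacks, omitted here for brevity):

#include <vector>

struct elem { int v; };                    // stands in for llama_grammar_element

struct machine {
    std::vector<std::vector<elem>> rules;
    std::vector<const elem *>      stack;  // entries point into rules
};

static machine copy_machine(const machine & src) {
    machine dst = { src.rules, src.stack };
    // redirect stack entries from src.rules storage to dst.rules storage
    for (size_t is = 0; is < dst.stack.size(); is++) {
        for (size_t ir0 = 0; ir0 < src.rules.size(); ir0++) {
            for (size_t ir1 = 0; ir1 < src.rules[ir0].size(); ir1++) {
                if (src.stack[is] == &src.rules[ir0][ir1]) {
                    dst.stack[is] = &dst.rules[ir0][ir1];
                }
            }
        }
    }
    return dst;
}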
// //
// sampling // sampling
// //

View file

@ -410,6 +410,8 @@ extern "C" {
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
// //
// Sampling functions // Sampling functions
// //