Merge branch 'master' into fix-eos

This commit is contained in:
Georgi Gerganov 2023-08-23 22:40:19 +03:00
commit 977629a34e
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
82 changed files with 11987 additions and 3361 deletions

View file

@ -6,27 +6,6 @@ find_package(Threads REQUIRED)
# ...
# common
set(TARGET common)
add_library(${TARGET} OBJECT
common.h
common.cpp
console.h
console.cpp
grammar-parser.h
grammar-parser.cpp
)
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE llama)
# examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

View file

@ -1,718 +0,0 @@
#include "common.h"
#include <cassert>
#include <iostream>
#include <cstring>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#include <sstream>
#include <unordered_set>
#include <regex>
#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <fcntl.h>
#include <io.h>
#else
#include <sys/ioctl.h>
#include <unistd.h>
#endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu"
+ std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
}
std::string line;
if (std::getline(thread_siblings, line)) {
siblings.insert(line);
}
}
if (siblings.size() > 0) {
return static_cast<int32_t>(siblings.size());
}
#elif defined(__APPLE__) && defined(__MACH__)
int32_t num_physical_cores;
size_t len = sizeof(num_physical_cores);
int result = sysctlbyname("hw.perflevel0.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
result = sysctlbyname("hw.physicalcpu", &num_physical_cores, &len, NULL, 0);
if (result == 0) {
return num_physical_cores;
}
#elif defined(_WIN32)
//TODO: Implement
#endif
unsigned int n_threads = std::thread::hardware_concurrency();
return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
}
void process_escapes(std::string& input) {
std::size_t input_len = input.length();
std::size_t output_idx = 0;
for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
switch (input[++input_idx]) {
case 'n': input[output_idx++] = '\n'; break;
case 'r': input[output_idx++] = '\r'; break;
case 't': input[output_idx++] = '\t'; break;
case '\'': input[output_idx++] = '\''; break;
case '\"': input[output_idx++] = '\"'; break;
case '\\': input[output_idx++] = '\\'; break;
default: input[output_idx++] = '\\';
input[output_idx++] = input[input_idx]; break;
}
} else {
input[output_idx++] = input[input_idx];
}
}
input.resize(output_idx);
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
bool invalid_param = false;
bool escape_prompt = false;
std::string arg;
gpt_params default_params;
const std::string arg_prefix = "--";
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
std::replace(arg.begin(), arg.end(), '_', '-');
}
if (arg == "-s" || arg == "--seed") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.seed = std::stoul(argv[i]);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
if (params.n_threads <= 0) {
params.n_threads = std::thread::hardware_concurrency();
}
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.prompt = argv[i];
} else if (arg == "-e") {
escape_prompt = true;
} else if (arg == "--prompt-cache") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.path_prompt_cache = argv[i];
} else if (arg == "--prompt-cache-all") {
params.prompt_cache_all = true;
} else if (arg == "--prompt-cache-ro") {
params.prompt_cache_ro = true;
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
} else if (arg == "-n" || arg == "--n-predict") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
} else if (arg == "--top-k") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
} else if (arg == "-gqa" || arg == "--gqa") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_gqa = std::stoi(argv[i]);
} else if (arg == "-eps" || arg == "--rms-norm-eps") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rms_norm_eps = std::stof(argv[i]);
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_base = std::stof(argv[i]);
} else if (arg == "--rope-freq-scale") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_scale = std::stof(argv[i]);
} else if (arg == "--rope-scale") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_scale = 1.0f/std::stof(argv[i]);
} else if (arg == "--memory-f32") {
params.memory_f16 = false;
} else if (arg == "--top-p") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.top_p = std::stof(argv[i]);
} else if (arg == "--temp") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.temp = std::stof(argv[i]);
} else if (arg == "--tfs") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.tfs_z = std::stof(argv[i]);
} else if (arg == "--typical") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.typical_p = std::stof(argv[i]);
} else if (arg == "--repeat-last-n") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.repeat_last_n = std::stoi(argv[i]);
} else if (arg == "--repeat-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.repeat_penalty = std::stof(argv[i]);
} else if (arg == "--frequency-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.frequency_penalty = std::stof(argv[i]);
} else if (arg == "--presence-penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.presence_penalty = std::stof(argv[i]);
} else if (arg == "--mirostat") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mirostat = std::stoi(argv[i]);
} else if (arg == "--mirostat-lr") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mirostat_eta = std::stof(argv[i]);
} else if (arg == "--mirostat-ent") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mirostat_tau = std::stof(argv[i]);
} else if (arg == "--cfg-negative-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cfg_negative_prompt = argv[i];
} else if (arg == "--cfg-negative-prompt-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.cfg_negative_prompt));
if (params.cfg_negative_prompt.back() == '\n') {
params.cfg_negative_prompt.pop_back();
}
} else if (arg == "--cfg-scale") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cfg_scale = std::stof(argv[i]);
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
} else if (arg == "--keep") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_keep = std::stoi(argv[i]);
} else if (arg == "--chunks") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_chunks = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model = argv[i];
} else if (arg == "-a" || arg == "--alias") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_alias = argv[i];
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--embedding") {
params.embedding = true;
} else if (arg == "--interactive-first") {
params.interactive_first = true;
} else if (arg == "-ins" || arg == "--instruct") {
params.instruct = true;
} else if (arg == "--multiline-input") {
params.multiline_input = true;
} else if (arg == "--simple-io") {
params.simple_io = true;
} else if (arg == "--color") {
params.use_color = true;
} else if (arg == "--mlock") {
params.use_mlock = true;
} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
std::string arg_next = argv[i];
// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
if (i < split_arg.size()) {
params.tensor_split[i] = std::stof(split_arg[i]);
} else {
params.tensor_split[i] = 0.0f;
}
}
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--mul-mat-q" || arg == "-mmq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--numa") {
params.numa = true;
} else if (arg == "--export") {
params.export_cgraph = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.antiprompt.push_back(argv[i]);
} else if (arg == "--perplexity") {
params.perplexity = true;
} else if (arg == "--hellaswag") {
params.hellaswag = true;
} else if (arg == "--hellaswag-tasks") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.hellaswag_tasks = std::stoi(argv[i]);
} else if (arg == "--ignore-eos") {
params.logit_bias[llama_token_eos()] = -INFINITY;
} else if (arg == "--no-penalize-nl") {
params.penalize_nl = false;
} else if (arg == "-l" || arg == "--logit-bias") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::stringstream ss(argv[i]);
llama_token key;
char sign;
std::string value_str;
try {
if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) {
params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f);
} else {
throw std::exception();
}
} catch (const std::exception&) {
invalid_param = true;
break;
}
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, default_params);
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
} else if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
} else if (arg == "--in-prefix") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.input_prefix = argv[i];
} else if (arg == "--in-suffix") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.input_suffix = argv[i];
} else if (arg == "--grammar") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.grammar = argv[i];
} else if (arg == "--grammar-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(
std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(params.grammar)
);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (params.prompt_cache_all &&
(params.interactive || params.interactive_first ||
params.instruct)) {
fprintf(stderr, "error: --prompt-cache-all not supported in interactive mode yet\n");
gpt_print_usage(argc, argv, default_params);
exit(1);
}
if (escape_prompt) {
process_escapes(params.prompt);
process_escapes(params.input_prefix);
process_escapes(params.input_suffix);
}
return true;
}
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, "usage: %s [options]\n", argv[0]);
fprintf(stdout, "\n");
fprintf(stdout, "options:\n");
fprintf(stdout, " -h, --help show this help message and exit\n");
fprintf(stdout, " -i, --interactive run in interactive mode\n");
fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n");
fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n");
fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n");
fprintf(stdout, " (can be specified more than once for multiple prompts).\n");
fprintf(stdout, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -p PROMPT, --prompt PROMPT\n");
fprintf(stdout, " prompt to start generation with (default: empty)\n");
fprintf(stdout, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
fprintf(stdout, " not supported with --interactive or other interactive options\n");
fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
fprintf(stdout, " --random-prompt start with a randomized prompt.\n");
fprintf(stdout, " --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
fprintf(stdout, " -f FNAME, --file FNAME\n");
fprintf(stdout, " prompt file to start generation.\n");
fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k);
fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p);
fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z);
fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p);
fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n);
fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
fprintf(stdout, " --mirostat N use Mirostat sampling.\n");
fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta);
fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau);
fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n");
fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
fprintf(stdout, " --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
fprintf(stdout, " --grammar-file FNAME file to read grammar from\n");
fprintf(stdout, " --cfg-negative-prompt PROMPT\n");
fprintf(stdout, " negative prompt to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-negative-prompt-file FNAME\n");
fprintf(stdout, " negative prompt file to use for guidance. (default: empty)\n");
fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale);
fprintf(stdout, " --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale (default: %g)\n", 1.0f/params.rope_freq_scale);
fprintf(stdout, " --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
fprintf(stdout, " --no-penalize-nl do not penalize newline token\n");
fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n");
fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n");
fprintf(stdout, " --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
fprintf(stdout, " --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
fprintf(stdout, " --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
if (llama_mlock_supported()) {
fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
if (llama_mmap_supported()) {
fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n");
fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n");
fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n");
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
fprintf(stdout, " -ngl N, --n-gpu-layers N\n");
fprintf(stdout, " number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
#endif
fprintf(stdout, " --mtest compute maximum memory usage\n");
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
fprintf(stdout, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
fprintf(stdout, "\n");
}
std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
case 0: return "So";
case 1: return "Once upon a time";
case 2: return "When";
case 3: return "The";
case 4: return "After";
case 5: return "If";
case 6: return "import";
case 7: return "He";
case 8: return "She";
case 9: return "They";
default: return "To";
}
return "The";
}
// TODO: not great allocating this every time
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int) add_bos);
const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
return res;
}
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_batch = params.n_batch;
lparams.n_gqa = params.n_gqa;
lparams.rms_norm_eps = params.rms_norm_eps;
lparams.n_gpu_layers = params.n_gpu_layers;
lparams.main_gpu = params.main_gpu;
lparams.tensor_split = params.tensor_split;
lparams.low_vram = params.low_vram;
lparams.mul_mat_q = params.mul_mat_q;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;
lparams.rope_freq_base = params.rope_freq_base;
lparams.rope_freq_scale = params.rope_freq_scale;
return lparams;
}
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
}
llama_context * lctx = llama_new_context_with_model(model, lparams);
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
if (!params.lora_adapter.empty()) {
int err = llama_model_apply_lora_from_file(model,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
}
return std::make_tuple(model, lctx);
}

View file

@ -1,114 +0,0 @@
// Various helper functions and utilities
#pragma once
#include "llama.h"
#include <string>
#include <vector>
#include <random>
#include <thread>
#include <unordered_map>
#include <tuple>
//
// CLI argument parsing
//
int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = -1; // RNG seed
int32_t n_threads = get_num_physical_cores();
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
// sampling parameters
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
int32_t top_k = 40; // <= 0 to use vocab size
float top_p = 0.95f; // 1.0 = disabled
float tfs_z = 1.00f; // 1.0 = disabled
float typical_p = 1.00f; // 1.0 = disabled
float temp = 0.80f; // 1.0 = disabled
float repeat_penalty = 1.10f; // 1.0 = disabled
int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
float frequency_penalty = 0.00f; // 0.0 = disabled
float presence_penalty = 0.00f; // 0.0 = disabled
int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
float mirostat_tau = 5.00f; // target entropy
float mirostat_eta = 0.10f; // learning rate
// Classifier-Free Guidance
// https://arxiv.org/abs/2306.17806
std::string cfg_negative_prompt; // string to help guidance
float cfg_scale = 1.f; // How strong is guidance
std::string model = "models/7B/ggml-model.bin"; // model path
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::string grammar = ""; // optional BNF-like grammar to constrain sampling
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
bool interactive_first = false; // wait for user input immediately
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool instruct = false; // instruction mode (used for Alpaca models)
bool penalize_nl = true; // consider newlines as a repeatable token
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool numa = false; // attempt optimizations that help on some NUMA systems
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
//
// Model utils
//
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

View file

@ -1,500 +0,0 @@
#include "console.h"
#include <vector>
#include <iostream>
#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <fcntl.h>
#include <io.h>
#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
#endif
#else
#include <climits>
#include <sys/ioctl.h>
#include <unistd.h>
#include <wchar.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <termios.h>
#endif
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
namespace console {
//
// Console state
//
static bool advanced_display = false;
static bool simple_io = true;
static display_t current_display = reset;
static FILE* out = stdout;
#if defined (_WIN32)
static void* hConsole;
#else
static FILE* tty = nullptr;
static termios initial_state;
#endif
//
// Init and cleanup
//
void init(bool use_simple_io, bool use_advanced_display) {
advanced_display = use_advanced_display;
simple_io = use_simple_io;
#if defined(_WIN32)
// Windows-specific console initialization
DWORD dwMode = 0;
hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) {
hConsole = GetStdHandle(STD_ERROR_HANDLE);
if (hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(hConsole, &dwMode))) {
hConsole = nullptr;
simple_io = true;
}
}
if (hConsole) {
// Check conditions combined to reduce nesting
if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
!SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
advanced_display = false;
}
// Set console output codepage to UTF8
SetConsoleOutputCP(CP_UTF8);
}
HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
// Set console input codepage to UTF16
_setmode(_fileno(stdin), _O_WTEXT);
// Set ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
if (simple_io) {
dwMode |= ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT;
} else {
dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
}
if (!SetConsoleMode(hConIn, dwMode)) {
simple_io = true;
}
}
#else
// POSIX-specific console initialization
if (!simple_io) {
struct termios new_termios;
tcgetattr(STDIN_FILENO, &initial_state);
new_termios = initial_state;
new_termios.c_lflag &= ~(ICANON | ECHO);
new_termios.c_cc[VMIN] = 1;
new_termios.c_cc[VTIME] = 0;
tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
tty = fopen("/dev/tty", "w+");
if (tty != nullptr) {
out = tty;
}
}
setlocale(LC_ALL, "");
#endif
}
void cleanup() {
// Reset console display
set_display(reset);
#if !defined(_WIN32)
// Restore settings on POSIX systems
if (!simple_io) {
if (tty != nullptr) {
out = stdout;
fclose(tty);
tty = nullptr;
}
tcsetattr(STDIN_FILENO, TCSANOW, &initial_state);
}
#endif
}
//
// Display and IO
//
// Keep track of current display and only emit ANSI code if it changes
void set_display(display_t display) {
if (advanced_display && current_display != display) {
fflush(stdout);
switch(display) {
case reset:
fprintf(out, ANSI_COLOR_RESET);
break;
case prompt:
fprintf(out, ANSI_COLOR_YELLOW);
break;
case user_input:
fprintf(out, ANSI_BOLD ANSI_COLOR_GREEN);
break;
case error:
fprintf(out, ANSI_BOLD ANSI_COLOR_RED);
}
current_display = display;
fflush(out);
}
}
char32_t getchar32() {
#if defined(_WIN32)
HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
wchar_t high_surrogate = 0;
while (true) {
INPUT_RECORD record;
DWORD count;
if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
return WEOF;
}
if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
if (wc == 0) {
continue;
}
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
high_surrogate = wc;
continue;
}
if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
if (high_surrogate != 0) { // Check if we have a high surrogate
return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
}
}
high_surrogate = 0; // Reset the high surrogate
return static_cast<char32_t>(wc);
}
}
#else
wchar_t wc = getwchar();
if (static_cast<wint_t>(wc) == WEOF) {
return WEOF;
}
#if WCHAR_MAX == 0xFFFF
if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
wchar_t low_surrogate = getwchar();
if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
}
}
if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
return 0xFFFD; // Return the replacement character U+FFFD
}
#endif
return static_cast<char32_t>(wc);
#endif
}
void pop_cursor() {
#if defined(_WIN32)
if (hConsole != NULL) {
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
GetConsoleScreenBufferInfo(hConsole, &bufferInfo);
COORD newCursorPosition = bufferInfo.dwCursorPosition;
if (newCursorPosition.X == 0) {
newCursorPosition.X = bufferInfo.dwSize.X - 1;
newCursorPosition.Y -= 1;
} else {
newCursorPosition.X -= 1;
}
SetConsoleCursorPosition(hConsole, newCursorPosition);
return;
}
#endif
putc('\b', out);
}
int estimateWidth(char32_t codepoint) {
#if defined(_WIN32)
return 1;
#else
return wcwidth(codepoint);
#endif
}
int put_codepoint(const char* utf8_codepoint, size_t length, int expectedWidth) {
#if defined(_WIN32)
CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
if (!GetConsoleScreenBufferInfo(hConsole, &bufferInfo)) {
// go with the default
return expectedWidth;
}
COORD initialPosition = bufferInfo.dwCursorPosition;
DWORD nNumberOfChars = length;
WriteConsole(hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
// Figure out our real position if we're in the last column
if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
DWORD nNumberOfChars;
WriteConsole(hConsole, &" \b", 2, &nNumberOfChars, NULL);
GetConsoleScreenBufferInfo(hConsole, &newBufferInfo);
}
int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
if (width < 0) {
width += newBufferInfo.dwSize.X;
}
return width;
#else
// We can trust expectedWidth if we've got one
if (expectedWidth >= 0 || tty == nullptr) {
fwrite(utf8_codepoint, length, 1, out);
return expectedWidth;
}
fputs("\033[6n", tty); // Query cursor position
int x1;
int y1;
int x2;
int y2;
int results = 0;
results = fscanf(tty, "\033[%d;%dR", &y1, &x1);
fwrite(utf8_codepoint, length, 1, tty);
fputs("\033[6n", tty); // Query cursor position
results += fscanf(tty, "\033[%d;%dR", &y2, &x2);
if (results != 4) {
return expectedWidth;
}
int width = x2 - x1;
if (width < 0) {
// Calculate the width considering text wrapping
struct winsize w;
ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
width += w.ws_col;
}
return width;
#endif
}
void replace_last(char ch) {
#if defined(_WIN32)
pop_cursor();
put_codepoint(&ch, 1, 1);
#else
fprintf(out, "\b%c", ch);
#endif
}
void append_utf8(char32_t ch, std::string & out) {
if (ch <= 0x7F) {
out.push_back(static_cast<unsigned char>(ch));
} else if (ch <= 0x7FF) {
out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
} else if (ch <= 0xFFFF) {
out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
} else if (ch <= 0x10FFFF) {
out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
} else {
// Invalid Unicode code point
}
}
// Helper function to remove the last UTF-8 character from a string
void pop_back_utf8_char(std::string & line) {
if (line.empty()) {
return;
}
size_t pos = line.length() - 1;
// Find the start of the last UTF-8 character (checking up to 4 bytes back)
for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
if ((line[pos] & 0xC0) != 0x80) {
break; // Found the start of the character
}
}
line.erase(pos);
}
bool readline_advanced(std::string & line, bool multiline_input) {
if (out != stdout) {
fflush(stdout);
}
line.clear();
std::vector<int> widths;
bool is_special_char = false;
bool end_of_stream = false;
char32_t input_char;
while (true) {
fflush(out); // Ensure all output is displayed before waiting for input
input_char = getchar32();
if (input_char == '\r' || input_char == '\n') {
break;
}
if (input_char == (char32_t) WEOF || input_char == 0x04 /* Ctrl+D*/) {
end_of_stream = true;
break;
}
if (is_special_char) {
set_display(user_input);
replace_last(line.back());
is_special_char = false;
}
if (input_char == '\033') { // Escape sequence
char32_t code = getchar32();
if (code == '[' || code == 0x1B) {
// Discard the rest of the escape sequence
while ((code = getchar32()) != (char32_t) WEOF) {
if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
break;
}
}
}
} else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
if (!widths.empty()) {
int count;
do {
count = widths.back();
widths.pop_back();
// Move cursor back, print space, and move cursor back again
for (int i = 0; i < count; i++) {
replace_last(' ');
pop_cursor();
}
pop_back_utf8_char(line);
} while (count == 0 && !widths.empty());
}
} else {
int offset = line.length();
append_utf8(input_char, line);
int width = put_codepoint(line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
if (width < 0) {
width = 0;
}
widths.push_back(width);
}
if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
set_display(prompt);
replace_last(line.back());
is_special_char = true;
}
}
bool has_more = multiline_input;
if (is_special_char) {
replace_last(' ');
pop_cursor();
char last = line.back();
line.pop_back();
if (last == '\\') {
line += '\n';
fputc('\n', out);
has_more = !has_more;
} else {
// llama will just eat the single space, it won't act as a space
if (line.length() == 1 && line.back() == ' ') {
line.clear();
pop_cursor();
}
has_more = false;
}
} else {
if (end_of_stream) {
has_more = false;
} else {
line += '\n';
fputc('\n', out);
}
}
fflush(out);
return has_more;
}
bool readline_simple(std::string & line, bool multiline_input) {
#if defined(_WIN32)
std::wstring wline;
if (!std::getline(std::wcin, wline)) {
// Input stream is bad or EOF received
line.clear();
GenerateConsoleCtrlEvent(CTRL_C_EVENT, 0);
return false;
}
int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), NULL, 0, NULL, NULL);
line.resize(size_needed);
WideCharToMultiByte(CP_UTF8, 0, &wline[0], (int)wline.size(), &line[0], size_needed, NULL, NULL);
#else
if (!std::getline(std::cin, line)) {
// Input stream is bad or EOF received
line.clear();
return false;
}
#endif
if (!line.empty()) {
char last = line.back();
if (last == '/') { // Always return control on '/' symbol
line.pop_back();
return false;
}
if (last == '\\') { // '\\' changes the default action
line.pop_back();
multiline_input = !multiline_input;
}
}
line += '\n';
// By default, continue input if multiline_input is set
return multiline_input;
}
bool readline(std::string & line, bool multiline_input) {
set_display(user_input);
if (simple_io) {
return readline_simple(line, multiline_input);
}
return readline_advanced(line, multiline_input);
}
}

View file

@ -1,19 +0,0 @@
// Console functions
#pragma once
#include <string>
namespace console {
enum display_t {
reset = 0,
prompt,
user_input,
error
};
void init(bool use_simple_io, bool use_advanced_display);
void cleanup();
void set_display(display_t display);
bool readline(std::string & line, bool multiline_input);
}

View file

@ -12,15 +12,19 @@ usage: ./convert-llama2c-to-ggml [options]
options:
-h, --help show this help message and exit
--copy-vocab-from-model FNAME model path from which to copy vocab (default 'models/ggml-vocab.bin')
--copy-vocab-from-model FNAME model path from which to copy vocab (default 'tokenizer.bin')
--llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model
--llama2c-output-model FNAME model path to save the converted llama2.c model (default ak_llama_model.bin')
```
An example command is as follows:
An example command using a model from [karpathy/tinyllamas](https://huggingface.co/karpathy/tinyllamas) is as follows:
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
`$ ./convert-llama2c-to-ggml --copy-vocab-from-model ../llama2.c/tokenizer.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.ggmlv3.bin`
Now you can use the model with command like:
For now the generated model is in the legacy GGJTv3 format, so you need to convert it to gguf manually:
`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
`$ python ./convert-llama-ggmlv3-to-gguf.py --eps 1e-5 --input stories42M.ggmlv3.bin --output stories42M.gguf.bin`
Now you can use the model with a command like:
`$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256`

View file

@ -1,5 +1,6 @@
#include "ggml.h"
#include "llama.h"
#include <unordered_map>
#include <vector>
#include <cassert>
@ -16,6 +17,9 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt'
#define LLAMA_FILE_VERSION_GGJT_V3 3
//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
typedef struct {
int dim; // transformer dimension
@ -48,10 +52,10 @@ typedef struct {
// float* freq_cis_real; // (seq_len, dim/2)
// float* freq_cis_imag; // (seq_len, dim/2)
// (optional) classifier weights for the logits, on the last layer
//float* wcls;
float* wcls;
} TransformerWeights;
void malloc_weights(TransformerWeights* w, Config* p) {
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
// we calloc instead of malloc to keep valgrind happy
w->token_embedding_table = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
@ -85,9 +89,16 @@ void malloc_weights(TransformerWeights* w, Config* p) {
w->rms_final_weight = new float[p->dim]();
printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim);
if (shared_weights) {
w->wcls = NULL;
} else {
w->wcls = new float[p->vocab_size * p->dim]();
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
}
}
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
@ -99,6 +110,22 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->hidden_dim * p->dim)) return 1;
if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim)) return 1;
if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast<size_t>(p->dim)) return 1;
// Skip freq_cis_real & freq_cis_imag
int head_size = p->dim / p->n_heads;
fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR);
if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
// Check we didn't forget to read anything
auto curr = ftell(f);
fseek(f, 0, SEEK_END);
auto end = ftell(f);
if (curr != end) {
printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end);
return 1;
}
return 0;
}
@ -114,6 +141,7 @@ void free_weights(TransformerWeights* w) {
delete w->w2;
delete w->w3;
delete w->rms_final_weight;
if (w->wcls) delete w->wcls;
}
void print_sample_weights(TransformerWeights *w){
@ -130,6 +158,7 @@ void print_sample_weights(TransformerWeights *w){
printf("%f\n", w->w2[0]);
printf("%f\n", w->w3[0]);
printf("%f\n", w->rms_att_weight[0]);
if (w->wcls) printf("%f\n", w->wcls[0]);
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////
@ -138,14 +167,16 @@ void print_sample_weights(TransformerWeights *w){
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_score {
token tok;
struct token_data {
token text;
float score;
ttype type;
};
std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token;
std::vector<token_data> id_to_token;
};
struct my_llama_hparams {
@ -502,49 +533,51 @@ bool is_ggml_file(const char *filename) {
return false;
}
uint32_t magic = file.read_u32();
return magic == LLAMA_FILE_MAGIC;
return magic == GGUF_MAGIC;
}
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
// heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
if (is_ggml_file(filename)) {
struct llama_context_params llama_params = llama_context_default_params();
llama_params.vocab_only = true;
struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
std::vector<const char *> strings;
std::vector<float> scores;
int n_vocab = llama_n_vocab(lctx);
strings.resize(n_vocab, NULL);
scores.resize(n_vocab, 0);
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
vocab->id_to_token.resize(n_vocab);
for (int i=0; i<n_vocab; ++i) {
std::string tok = std::string(strings[i]);
float score = scores[i];
vocab->id_to_token[i].tok = tok;
vocab->id_to_token[i].score = score;
vocab->token_to_id.emplace(tok, i);
}
llama_free(lctx);
llama_free_model(lmodel);
} else { // assume llama2.c vocabulary
#pragma message("TODO: implement reading vocabulary using gguf")
// // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
// if (is_ggml_file(filename)) {
//
// struct llama_context_params llama_params = llama_context_default_params();
// llama_params.vocab_only = true;
//
// struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
// struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
//
// const int n_vocab = llama_n_vocab(lctx);
// vocab->id_to_token.resize(n_vocab);
// for (int i=0; i<n_vocab; ++i) {
// vocab->id_to_token[i].text = llama_token_get_text(lctx, i);
// vocab->id_to_token[i].score = llama_token_get_score(lctx, i);
// vocab->id_to_token[i].type = llama_token_get_type(lctx, i);
// vocab->token_to_id.emplace(vocab->id_to_token[i].text, i);
// }
// llama_free(lctx);
// llama_free_model(lmodel);
// } else
{ // assume llama2.c vocabulary
printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
llama_file file(filename, "rb");
uint32_t n_vocab = config->vocab_size;
const int n_vocab = config->vocab_size;
/* uint32_t max_token_length = */ file.read_u32(); // unused
vocab->id_to_token.resize(n_vocab);
for (uint32_t i=0; i<n_vocab; ++i) {
for (int i=0; i<n_vocab; ++i) {
float_t score = file.read_f32();
uint32_t len = file.read_u32();
std::string tok = file.read_string(len);
vocab->id_to_token[i].tok = tok;
std::string text = file.read_string(len);
// Special-case handling of <0xXX> single byte tokens.
char byte_val;
if (sscanf(text.c_str(), "<0x%02hhX>", &byte_val) == 1) {
char cstr[2] = { byte_val, 0 };
text = cstr;
}
vocab->id_to_token[i].text = text;
vocab->id_to_token[i].score = score;
vocab->token_to_id.emplace(tok, i);
vocab->id_to_token[i].type = LLAMA_TOKEN_TYPE_UNDEFINED;
vocab->token_to_id.emplace(text, i);
}
}
}
@ -590,9 +623,11 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
if (file.fp == NULL) {
return;
}
#pragma message("TODO: implement file saving using gguf")
// write_magic
file.write_u32(LLAMA_FILE_MAGIC); // magic
file.write_u32(LLAMA_FILE_VERSION); // version
file.write_u32(LLAMA_FILE_MAGIC_GGJT); // magic
file.write_u32(LLAMA_FILE_VERSION_GGJT_V3); // version
// write_hparams
file.write_u32(model->hparams.n_vocab);
file.write_u32(model->hparams.n_embd);
@ -605,17 +640,17 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
// write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
uint32_t n_vocab = model->hparams.n_vocab;
for (uint32_t i = 0; i < n_vocab; i++) {
const auto & token_score = vocab->id_to_token.at(i);
file.write_u32((uint32_t) token_score.tok.size());
file.write_raw(token_score.tok.data(), token_score.tok.size());
file.write_raw(&token_score.score, sizeof(token_score.score));
const auto & token_data = vocab->id_to_token.at(i);
file.write_u32((uint32_t) token_data.text.size());
file.write_raw(token_data.text.data(), token_data.text.size());
file.write_raw(&token_data.score, sizeof(token_data.score));
}
// stuff AK weights into GG weights one by one.
// w->token_embedding_table -> model->tok_embeddings
// float* -> struct ggml_tensor
stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
//print_row(model->norm, 0);
@ -663,7 +698,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
struct train_params get_default_train_params() {
struct train_params params;
params.fn_vocab_model = "models/ggml-vocab.bin";
params.fn_vocab_model = "tokenizer.bin";
params.fn_llama2c_output_model = "ak_llama_model.bin";
params.fn_train_data = "shakespeare.txt";
params.fn_checkpoint_in = "checkpoint.bin";
@ -716,7 +751,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
fprintf(stderr, " --copy-vocab-from-model FNAME llama2.c vocabulary or ggmlv3 model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
fprintf(stderr, " --llama2c-model FNAME [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
fprintf(stderr, " --llama2c-output-model FNAME model path to save the converted llama2.c model (default %s')\n", params->fn_llama2c_output_model);
fprintf(stderr, "\n");
@ -789,9 +824,12 @@ int main(int argc, char ** argv) {
if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
// read in the config header
if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; }
auto shared_weights = config.vocab_size > 0;
config.vocab_size = abs(config.vocab_size);
// read in the Transformer weights
malloc_weights(&weights, &config);
if(checkpoint_init_weights(&weights, &config, file)) { return 1; }
malloc_weights(&weights, &config, shared_weights);
if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; }
fclose(file);
}

View file

@ -167,7 +167,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// TODO: Apply penalties
// float nl_logit = logits[llama_token_nl()];
// float nl_logit = logits[llama_token_nl(ctx)];
// auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
// llama_sample_repetition_penalty(ctx, &candidates_p,
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -176,7 +176,7 @@ llama_token sampling_id(struct MyModel* mymodel) {
// last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
// last_n_repeat, alpha_frequency, alpha_presence);
// if (!penalize_nl) {
// logits[llama_token_nl()] = nl_logit;
// logits[llama_token_nl(ctx)] = nl_logit;
// }
if (temp <= 0) {
@ -211,7 +211,7 @@ const char * sampling(struct MyModel * mymodel) {
llama_context * ctx = mymodel->ctx;
int id = sampling_id(mymodel);
static std::string ret;
if (id == llama_token_eos()) {
if (id == llama_token_eos(ctx)) {
ret = "</s>";
} else {
ret = llama_token_to_str(ctx, id);

1
examples/embd-input/embd_input.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import ctypes
from ctypes import cdll, c_char_p, c_void_p, POINTER, c_float, c_int
import numpy as np

1
examples/embd-input/llava.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))

1
examples/embd-input/minigpt4.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))

1
examples/embd-input/panda_gpt.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import sys
import os
sys.path.insert(0, os.path.dirname(__file__))

View file

@ -67,28 +67,35 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
}
fprintf(stderr, "\n");
}
if (params.embedding){
if (embd_inp.size() > 0) {
if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
const int n_embd = llama_n_embd(ctx);
const auto embeddings = llama_get_embeddings(ctx);
for (int i = 0; i < n_embd; i++) {
printf("%f ", embeddings[i]);
}
printf("\n");
if (embd_inp.size() > (size_t)params.n_ctx) {
fprintf(stderr, "%s: error: prompt is longer than the context window (%zu tokens, n_ctx = %d)\n",
__func__, embd_inp.size(), params.n_ctx);
return 1;
}
while (!embd_inp.empty()) {
int n_tokens = std::min(params.n_batch, (int) embd_inp.size());
if (llama_eval(ctx, embd_inp.data(), n_tokens, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
n_past += n_tokens;
embd_inp.erase(embd_inp.begin(), embd_inp.begin() + n_tokens);
}
const int n_embd = llama_n_embd(ctx);
const auto embeddings = llama_get_embeddings(ctx);
for (int i = 0; i < n_embd; i++) {
printf("%f ", embeddings[i]);
}
printf("\n");
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

246
examples/gguf/gguf.cpp Normal file
View file

@ -0,0 +1,246 @@
#include "ggml.h"
#include "llama.h"
#include <cstdio>
#include <cinttypes>
#include <string>
#include <sstream>
#include <fstream>
#include <vector>
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
template<typename T>
static std::string to_string(const T & val) {
std::stringstream ss;
ss << val;
return ss.str();
}
bool gguf_ex_write(const std::string & fname) {
struct gguf_context * ctx = gguf_init_empty();
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
gguf_set_val_i8 (ctx, "some.parameter.int8", -0x13);
gguf_set_val_u16 (ctx, "some.parameter.uint16", 0x1234);
gguf_set_val_i16 (ctx, "some.parameter.int16", -0x1235);
gguf_set_val_u32 (ctx, "some.parameter.uint32", 0x12345678);
gguf_set_val_i32 (ctx, "some.parameter.int32", -0x12345679);
gguf_set_val_f32 (ctx, "some.parameter.float32", 0.123456789f);
gguf_set_val_bool(ctx, "some.parameter.bool", true);
gguf_set_val_str (ctx, "some.parameter.string", "hello world");
gguf_set_arr_data(ctx, "some.parameter.arr.i16", GGUF_TYPE_INT16, std::vector<int16_t>{ 1, 2, 3, 4, }.data(), 4);
gguf_set_arr_data(ctx, "some.parameter.arr.f32", GGUF_TYPE_FLOAT32, std::vector<float>{ 3.145f, 2.718f, 1.414f, }.data(), 3);
gguf_set_arr_str (ctx, "some.parameter.arr.str", std::vector<const char *>{ "hello", "world", "!" }.data(), 3);
struct ggml_init_params params = {
/*.mem_size =*/ 128ull*1024ull*1024ull,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx_data = ggml_init(params);
const int n_tensors = 10;
// tensor infos
for (int i = 0; i < n_tensors; ++i) {
const std::string name = "tensor_" + to_string(i);
int64_t ne[GGML_MAX_DIMS] = { 1 };
int32_t n_dims = rand() % GGML_MAX_DIMS + 1;
for (int j = 0; j < n_dims; ++j) {
ne[j] = rand() % 10 + 1;
}
struct ggml_tensor * cur = ggml_new_tensor(ctx_data, GGML_TYPE_F32, n_dims, ne);
ggml_set_name(cur, name.c_str());
{
float * data = (float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
data[j] = 100 + i;
}
}
gguf_add_tensor(ctx, cur);
}
gguf_write_to_file(ctx, fname.c_str(), false);
fprintf(stdout, "%s: wrote file '%s;\n", __func__, fname.c_str());
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
// just read tensor info
bool gguf_ex_read_0(const std::string & fname) {
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ NULL,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// find kv string
{
const char * findkey = "some.parameter.string";
const int keyidx = gguf_find_key(ctx, findkey);
if (keyidx == -1) {
fprintf(stdout, "%s: find key: %s not found.\n", __func__, findkey);
} else {
const char * key_value = gguf_get_val_str(ctx, keyidx);
fprintf(stdout, "%s: find key: %s found, kv[%d] value = %s\n", __func__, findkey, keyidx, key_value);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
gguf_free(ctx);
return true;
}
// read and create ggml_context containing the tensors and their data
bool gguf_ex_read_1(const std::string & fname) {
struct ggml_context * ctx_data = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &ctx_data,
};
struct gguf_context * ctx = gguf_init_from_file(fname.c_str(), params);
fprintf(stdout, "%s: version: %d\n", __func__, gguf_get_version(ctx));
fprintf(stdout, "%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx));
fprintf(stdout, "%s: data offset: %zu\n", __func__, gguf_get_data_offset(ctx));
// kv
{
const int n_kv = gguf_get_n_kv(ctx);
fprintf(stdout, "%s: n_kv: %d\n", __func__, n_kv);
for (int i = 0; i < n_kv; ++i) {
const char * key = gguf_get_key(ctx, i);
fprintf(stdout, "%s: kv[%d]: key = %s\n", __func__, i, key);
}
}
// tensor info
{
const int n_tensors = gguf_get_n_tensors(ctx);
fprintf(stdout, "%s: n_tensors: %d\n", __func__, n_tensors);
for (int i = 0; i < n_tensors; ++i) {
const char * name = gguf_get_tensor_name (ctx, i);
const size_t offset = gguf_get_tensor_offset(ctx, i);
fprintf(stdout, "%s: tensor[%d]: name = %s, offset = %zu\n", __func__, i, name, offset);
}
}
// data
{
const int n_tensors = gguf_get_n_tensors(ctx);
for (int i = 0; i < n_tensors; ++i) {
fprintf(stdout, "%s: reading tensor %d data\n", __func__, i);
const char * name = gguf_get_tensor_name(ctx, i);
struct ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
fprintf(stdout, "%s: tensor[%d]: n_dims = %d, name = %s, data = %p\n", __func__, i, cur->n_dims, cur->name, cur->data);
// print first 10 elements
const float * data = (const float *) cur->data;
printf("%s data[:10] : ", name);
for (int j = 0; j < MIN(10, ggml_nelements(cur)); ++j) {
printf("%f ", data[j]);
}
printf("\n\n");
// check data
{
const float * data = (const float *) cur->data;
for (int j = 0; j < ggml_nelements(cur); ++j) {
if (data[j] != 100 + i) {
fprintf(stderr, "%s: tensor[%d]: data[%d] = %f\n", __func__, i, j, data[j]);
return false;
}
}
}
}
}
fprintf(stdout, "%s: ctx_data size: %zu\n", __func__, ggml_get_mem_size(ctx_data));
ggml_free(ctx_data);
gguf_free(ctx);
return true;
}
int main(int argc, char ** argv) {
if (argc < 3) {
fprintf(stdout, "usage: %s data.gguf r|w\n", argv[0]);
return -1;
}
const std::string fname(argv[1]);
const std::string mode (argv[2]);
GGML_ASSERT((mode == "r" || mode == "w") && "mode must be r or w");
if (mode == "w") {
GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file");
} else if (mode == "r") {
GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file");
GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file");
}
return 0;
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,423 +0,0 @@
#include "grammar-parser.h"
#include <cstdint>
#include <cwchar>
#include <string>
#include <utility>
#include <stdexcept>
#include <exception>
namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun)
// copied from llama.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4;
int len = lookup[highbits];
uint8_t mask = (1 << (8 - len)) - 1;
uint32_t value = first_byte & mask;
const char * end = src + len; // may overrun!
const char * pos = src + 1;
for ( ; pos < end && *pos; pos++) {
value = (value << 6) + (static_cast<uint8_t>(*pos) & 0x3F);
}
return std::make_pair(value, pos);
}
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second;
}
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id;
}
void add_rule(
parse_state & state,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule) {
if (state.rules.size() <= rule_id) {
state.rules.resize(rule_id + 1);
}
state.rules[rule_id] = rule;
}
bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
}
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src;
const char * end = src + size;
uint32_t value = 0;
for ( ; pos < end && *pos; pos++) {
value <<= 4;
char c = *pos;
if ('a' <= c && c <= 'f') {
value += c - 'a' + 10;
} else if ('A' <= c && c <= 'F') {
value += c - 'A' + 10;
} else if ('0' <= c && c <= '9') {
value += c - '0';
} else {
break;
}
}
if (pos != end) {
throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src);
}
return std::make_pair(value, pos);
}
const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
if (*pos == '#') {
while (*pos && *pos != '\r' && *pos != '\n') {
pos++;
}
} else {
pos++;
}
}
return pos;
}
const char * parse_name(const char * src) {
const char * pos = src;
while (is_word_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting name at ") + src);
}
return pos;
}
std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
case 'x': return parse_hex(src + 2, 2);
case 'u': return parse_hex(src + 2, 4);
case 'U': return parse_hex(src + 2, 8);
case 't': return std::make_pair('\t', src + 2);
case 'r': return std::make_pair('\r', src + 2);
case 'n': return std::make_pair('\n', src + 2);
case '\\':
case '"':
case '[':
case ']':
return std::make_pair(src[1], src + 2);
default:
throw std::runtime_error(std::string("unknown escape at ") + src);
}
} else if (*src) {
return decode_utf8(src);
}
throw std::runtime_error("unexpected end of input");
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
parse_state & state,
const char * src,
const std::string & rule_name,
std::vector<llama_grammar_element> & out_elements,
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
while (*pos) {
if (*pos == '"') { // literal string
pos++;
last_sym_start = out_elements.size();
while (*pos != '"') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '[') { // char range(s)
pos++;
enum llama_gretype start_type = LLAMA_GRETYPE_CHAR;
if (*pos == '^') {
pos++;
start_type = LLAMA_GRETYPE_CHAR_NOT;
}
last_sym_start = out_elements.size();
while (*pos != ']') {
auto char_pair = parse_char(pos);
pos = char_pair.second;
enum llama_gretype type = last_sym_start < out_elements.size()
? LLAMA_GRETYPE_CHAR_ALT
: start_type;
out_elements.push_back({type, char_pair.first});
if (pos[0] == '-' && pos[1] != ']') {
auto endchar_pair = parse_char(pos + 1);
pos = endchar_pair.second;
out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
}
}
pos = parse_space(pos + 1, is_nested);
} else if (is_word_char(*pos)) { // rule reference
const char * name_end = parse_name(pos);
uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos);
pos = parse_space(name_end, is_nested);
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id});
} else if (*pos == '(') { // grouping
// parse nested alternates into synthesized rule
pos = parse_space(pos + 1, true);
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
pos = parse_alternates(state, pos, rule_name, sub_rule_id, true);
last_sym_start = out_elements.size();
// output reference to synthesized rule
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
if (*pos != ')') {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// rewrite rules:
// S* --> S' ::= S S' |
// S+ --> S' ::= S S' | S
// S? --> S' ::= S |
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
std::vector<llama_grammar_element> sub_rule;
// add preceding symbol to generated rule
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
if (*pos == '*' || *pos == '+') {
// cause generated rule to recurse
sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
}
// mark start of alternate def
sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
if (*pos == '+') {
// add preceding symbol as alternate only for '+' (otherwise empty)
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
}
sub_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, sub_rule_id, sub_rule);
// in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start);
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
pos = parse_space(pos + 1, is_nested);
} else {
break;
}
}
return pos;
}
const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested) {
std::vector<llama_grammar_element> rule;
const char * pos = parse_sequence(state, src, rule_name, rule, is_nested);
while (*pos == '|') {
rule.push_back({LLAMA_GRETYPE_ALT, 0});
pos = parse_space(pos + 1, true);
pos = parse_sequence(state, pos, rule_name, rule, is_nested);
}
rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rule_id, rule);
return pos;
}
const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
uint32_t rule_id = get_symbol_id(state, src, name_len);
const std::string name(src, name_len);
if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) {
throw std::runtime_error(std::string("expecting ::= at ") + pos);
}
pos = parse_space(pos + 3, true);
pos = parse_alternates(state, pos, name, rule_id, false);
if (*pos == '\r') {
pos += pos[1] == '\n' ? 2 : 1;
} else if (*pos == '\n') {
pos++;
} else if (*pos) {
throw std::runtime_error(std::string("expecting newline or end at ") + pos);
}
return parse_space(pos, true);
}
parse_state parse(const char * src) {
try {
parse_state state;
const char * pos = parse_space(src, true);
while (*pos) {
pos = parse_rule(state, pos);
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
return parse_state();
}
}
void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
} else {
// cop out of encoding UTF-8
fprintf(file, "<U+%04X>", c);
}
}
bool is_char_element(llama_grammar_element elem) {
switch (elem.type) {
case LLAMA_GRETYPE_CHAR: return true;
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
default: return false;
}
}
void print_rule_binary(FILE * file, const std::vector<llama_grammar_element> & rule) {
for (auto elem : rule) {
switch (elem.type) {
case LLAMA_GRETYPE_END: fprintf(file, "END"); break;
case LLAMA_GRETYPE_ALT: fprintf(file, "ALT"); break;
case LLAMA_GRETYPE_RULE_REF: fprintf(file, "RULE_REF"); break;
case LLAMA_GRETYPE_CHAR: fprintf(file, "CHAR"); break;
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
case LLAMA_GRETYPE_ALT:
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "(%u) ", elem.value);
break;
case LLAMA_GRETYPE_CHAR:
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
break;
}
}
fprintf(file, "\n");
}
void print_rule(
FILE * file,
uint32_t rule_id,
const std::vector<llama_grammar_element> & rule,
const std::map<uint32_t, std::string> & symbol_id_names) {
if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) {
throw std::runtime_error(
"malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id));
}
fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str());
for (size_t i = 0, end = rule.size() - 1; i < end; i++) {
llama_grammar_element elem = rule[i];
switch (elem.type) {
case LLAMA_GRETYPE_END:
throw std::runtime_error(
"unexpected end of rule: " + std::to_string(rule_id) + "," +
std::to_string(i));
case LLAMA_GRETYPE_ALT:
fprintf(file, "| ");
break;
case LLAMA_GRETYPE_RULE_REF:
fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str());
break;
case LLAMA_GRETYPE_CHAR:
fprintf(file, "[");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_NOT:
fprintf(file, "[^");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
fprintf(file, "-");
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_ALT:
if (i == 0 || !is_char_element(rule[i - 1])) {
throw std::runtime_error(
"LLAMA_GRETYPE_CHAR_ALT without preceding char: " +
std::to_string(rule_id) + "," + std::to_string(i));
}
print_grammar_char(file, elem.value);
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
break;
default:
fprintf(file, "] ");
}
}
}
fprintf(file, "\n");
}
void print_grammar(FILE * file, const parse_state & state) {
try {
std::map<uint32_t, std::string> symbol_id_names;
for (auto kv : state.symbol_ids) {
symbol_id_names[kv.second] = kv.first;
}
for (size_t i = 0, end = state.rules.size(); i < end; i++) {
// fprintf(file, "%zu: ", i);
// print_rule_binary(file, state.rules[i]);
print_rule(file, uint32_t(i), state.rules[i], symbol_id_names);
// fprintf(file, "\n");
}
} catch (const std::exception & err) {
fprintf(stderr, "\n%s: error printing grammar: %s\n", __func__, err.what());
}
}
std::vector<const llama_grammar_element *> parse_state::c_rules() {
std::vector<const llama_grammar_element *> ret;
for (const auto & rule : rules) {
ret.push_back(rule.data());
}
return ret;
}
}

View file

@ -1,29 +0,0 @@
// Implements a parser for an extended Backus-Naur form (BNF), producing the
// binary context-free grammar format specified by llama.h. Supports character
// ranges, grouping, and repetition operators. As an example, a grammar for
// arithmetic might look like:
//
// root ::= expr
// expr ::= term ([-+*/] term)*
// term ::= num | "(" space expr ")" space
// num ::= [0-9]+ space
// space ::= [ \t\n]*
#pragma once
#include "llama.h"
#include <vector>
#include <map>
#include <cstdint>
#include <string>
namespace grammar_parser {
struct parse_state {
std::map<std::string, uint32_t> symbol_ids;
std::vector<std::vector<llama_grammar_element>> rules;
std::vector<const llama_grammar_element *> c_rules();
};
parse_state parse(const char * src);
void print_grammar(FILE * file, const parse_state & state);
}

1
examples/jeopardy/graph.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import matplotlib.pyplot as plt
import os
import csv

0
examples/jeopardy/jeopardy.sh Normal file → Executable file
View file

1
examples/json-schema-to-grammar.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import argparse
import json
import re

View file

@ -148,7 +148,7 @@ struct cmd_params {
};
static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.bin"},
/* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512},
/* n_gen */ {128},
/* n_batch */ {512},
@ -179,12 +179,12 @@ static void print_usage(int /* argc */, char ** argv) {
fprintf(stdout, " -mg i, --main-gpu <n> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
fprintf(stdout, " -lv, --low-vram <0|1> (default: %s)\n", join(cmd_params_defaults.low_vram, ",").c_str());
fprintf(stdout, " -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str());
fprintf(stdout, " -ts, --tensor_split <ts> \n");
fprintf(stdout, " -ts, --tensor_split <ts0/ts1/..> \n");
fprintf(stdout, " -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : "md");
fprintf(stdout, " -o, --output <csv|json|md|sql> (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql");
fprintf(stdout, " -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
fprintf(stdout, "\n");
fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by repeating the parameter.\n");
fprintf(stdout, "Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
}
@ -606,6 +606,8 @@ const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
struct printer {
virtual ~printer() {}
FILE * fout;
virtual void print_header(const cmd_params & params) { (void) params; };
virtual void print_test(const test & t) = 0;
@ -726,7 +728,7 @@ struct markdown_printer : public printer {
if (!is_cpu_backend) {
fields.push_back("n_gpu_layers");
}
if (params.n_batch.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) {
fields.push_back("n_threads");
}
if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) {
@ -849,7 +851,7 @@ struct sql_printer : public printer {
};
static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
std::vector<llama_token> tokens(n_batch, llama_token_bos());
std::vector<llama_token> tokens(n_batch, llama_token_bos(ctx));
int n_processed = 0;
while (n_processed < n_prompt) {
int n_tokens = std::min(n_prompt - n_processed, n_batch);
@ -859,7 +861,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
}
static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
llama_token token = llama_token_bos();
llama_token token = llama_token_bos(ctx);
for (int i = 0; i < n_gen; i++) {
llama_eval(ctx, &token, 1, n_past + i, n_threads);
}

View file

@ -288,6 +288,10 @@ These options help improve the performance and memory usage of the LLaMA models.
- `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation.
### Grammars
- `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax.
### Quantization
For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).

View file

@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
{
fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos());
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
}
@ -191,16 +191,17 @@ int main(int argc, char ** argv) {
// tokenize the prompt
std::vector<llama_token> embd_inp;
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
embd_inp = ::llama_tokenize(ctx, params.prompt, true);
} else {
embd_inp = session_tokens;
}
// Should not run without any tokens
if (embd_inp.empty()) {
embd_inp.push_back(llama_token_bos(ctx));
}
// Tokenize negative prompt
std::vector<llama_token> guidance_inp;
int guidance_offset = 0;
@ -270,15 +271,12 @@ int main(int argc, char ** argv) {
params.interactive = true;
}
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]).c_str());
}
if (ctx_guidance) {
@ -286,14 +284,14 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str());
fprintf(stderr, "%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]));
fprintf(stderr, "%6d -> '%s'\n", guidance_inp[i], llama_token_to_str(ctx, guidance_inp[i]).c_str());
}
}
if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]).c_str());
}
fprintf(stderr, "'\n");
}
@ -311,7 +309,7 @@ int main(int argc, char ** argv) {
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(static_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
fprintf(stderr, "%s: interactive mode on.\n", __func__);
@ -352,10 +350,9 @@ int main(int argc, char ** argv) {
fprintf(stderr, "\n");
{
auto it = params.logit_bias.find(llama_token_eos());
auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) {
fprintf(stderr,
"%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
fprintf(stderr, "%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__);
}
}
@ -405,7 +402,7 @@ int main(int argc, char ** argv) {
// do one empty run to warm up the model
{
const std::vector<llama_token> tmp = { llama_token_bos(), };
const std::vector<llama_token> tmp = { llama_token_bos(ctx), };
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_reset_timings(ctx);
}
@ -589,7 +586,7 @@ int main(int argc, char ** argv) {
}
// Apply penalties
float nl_logit = logits[llama_token_nl()];
float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -598,7 +595,7 @@ int main(int argc, char ** argv) {
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl) {
logits[llama_token_nl()] = nl_logit;
logits[llama_token_nl(ctx)] = nl_logit;
}
if (grammar != NULL) {
@ -672,7 +669,7 @@ int main(int argc, char ** argv) {
// display text
if (input_echo) {
for (auto id : embd) {
printf("%s", llama_token_to_str(ctx, id));
printf("%s", llama_token_to_str(ctx, id).c_str());
}
fflush(stdout);
}
@ -714,7 +711,7 @@ int main(int argc, char ** argv) {
}
// deal with end of text token in interactive mode
if (last_n_tokens.back() == llama_token_eos()) {
if (last_n_tokens.back() == llama_token_eos(ctx)) {
if (params.interactive) {
if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt
@ -739,7 +736,7 @@ int main(int argc, char ** argv) {
}
if (params.input_prefix_bos) {
embd_inp.push_back(llama_token_bos());
embd_inp.push_back(llama_token_bos(ctx));
}
std::string buffer;
@ -795,8 +792,7 @@ int main(int argc, char ** argv) {
if (grammar != NULL) {
llama_grammar_free(grammar);
std::vector<const llama_grammar_element *> grammar_rules(
parsed_grammar.c_rules());
std::vector<const llama_grammar_element *> grammar_rules( parsed_grammar.c_rules());
grammar = llama_grammar_init(
grammar_rules.data(), grammar_rules.size(),
parsed_grammar.symbol_ids.at("root"));
@ -807,7 +803,7 @@ int main(int argc, char ** argv) {
}
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos() && !(params.instruct || params.interactive)) {
if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) {
fprintf(stderr, " [end of text]\n");
break;
}

1
examples/make-ggml.py Normal file → Executable file
View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
"""
This script converts Hugging Face llama models to GGML and quantizes them.

View file

@ -2,7 +2,7 @@
//
// - First, export a LLaMA graph:
//
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.bin --export
// $ ./bin/main -m ../models/7B/ggml-model-q4_0.gguf --export
//
// - Run this tool to evaluate the exported graph:
//

View file

@ -27,7 +27,121 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs;
}
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
// BOS tokens will be added for each chunk before eval
if (params.ppl_stride <= 0) {
fprintf(stderr, "%s: stride is %d but must be greater than zero!\n",__func__,params.ppl_stride);
return;
}
auto tokens = ::llama_tokenize(ctx, params.prompt, true);
const int calc_chunk = params.n_ctx;
fprintf(stderr, "%s: have %zu tokens. Calculation chunk = %d\n", __func__, tokens.size(), calc_chunk);
if (int(tokens.size()) <= calc_chunk) {
fprintf(stderr, "%s: there are only %zu tokens, this is not enough for a context size of %d and stride %d\n",__func__,
tokens.size(), params.n_ctx, params.ppl_stride);
return;
}
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
const int n_vocab = llama_n_vocab(ctx);
const int n_batch = params.n_batch;
int count = 0;
double nll = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.ppl_stride;
const int end = start + calc_chunk;
const int num_batches = (calc_chunk + n_batch - 1) / n_batch;
//fprintf(stderr, "%s: evaluating %d...%d using %d batches\n", __func__, start, end, num_batches);
std::vector<float> logits;
const auto t_start = std::chrono::high_resolution_clock::now();
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
const int batch_size = std::min(end - batch_start, n_batch);
//fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
//fprintf(stderr, "%s : failed to eval\n", __func__);
return;
}
// save original token and restore it after eval
const auto token_org = tokens[batch_start];
// add BOS token for the first batch of each chunk
if (j == 0) {
tokens[batch_start] = llama_token_bos(ctx);
}
const auto batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
if (j == 0) {
tokens[batch_start] = token_org;
}
}
const auto t_end = std::chrono::high_resolution_clock::now();
if (i == 0) {
const float t_total = std::chrono::duration<float>(t_end - t_start).count();
fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
int total_seconds = (int)(t_total * n_chunk);
if (total_seconds >= 60*60) {
fprintf(stderr, "%d hours ", total_seconds / (60*60));
total_seconds = total_seconds % (60*60);
}
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
//fprintf(stderr, "%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
for (int j = params.n_ctx - params.ppl_stride - 1; j < params.n_ctx - 1; ++j) {
// Calculate probability of next token, given the previous ones.
const std::vector<float> tok_logits(
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
printf("%8d %.4lf\n", i*params.ppl_stride, std::exp(nll / count));
}
fflush(stdout);
}
printf("\n");
}
void perplexity(llama_context * ctx, const gpt_params & params) {
if (params.ppl_stride > 0) {
perplexity_v2(ctx, params);
return;
}
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]`
@ -64,7 +178,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// add BOS token for the first batch of each chunk
if (j == 0) {
tokens[batch_start] = llama_token_bos();
tokens[batch_start] = llama_token_bos(ctx);
}
if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
@ -116,7 +230,11 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
++count;
}
// perplexity is e^(average negative log-likelihood)
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else {
printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count));
}
fflush(stdout);
}
printf("\n");
@ -369,6 +487,12 @@ int main(int argc, char ** argv) {
params.perplexity = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
if (params.ppl_stride > 0) {
fprintf(stderr, "Will perform strided perplexity calculation -> adjusting context size from %d to %d\n",
params.n_ctx, params.n_ctx + params.ppl_stride/2);
params.n_ctx += params.ppl_stride/2;
}
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);

View file

@ -24,7 +24,7 @@
#endif
struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
std::string model = "models/7B/ggml-model-f16.gguf";
bool verbose = false;
bool per_layer_stats = false;
bool print_histogram = false;

View file

@ -14,25 +14,25 @@ struct quant_option {
};
static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.50G, +0.2499 ppl @ 7B", },
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1846 ppl @ 7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.30G, +0.0796 ppl @ 7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0415 ppl @ 7B", },
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
#ifdef GGML_USE_K_QUANTS
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.67G, +0.8698 ppl @ 7B", },
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5505 ppl @ 7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.06G, +0.2437 ppl @ 7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1803 ppl @ 7B", },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.56G, +0.1149 ppl @ 7B", },
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0535 ppl @ 7B", },
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0353 ppl @ 7B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0142 ppl @ 7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0044 ppl @ 7B", },
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
#endif
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ 7B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
};
@ -68,10 +68,10 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
}
// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
void usage(const char * executable) {
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.bin [model-quant.bin] type [nthreads]\n\n", executable);
fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
fprintf(stderr, "\nAllowed quantization types:\n");
@ -118,8 +118,8 @@ int main(int argc, char ** argv) {
if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1);
}
// export as [inp path]/ggml-model-[ftype].bin
fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
// export as [inp path]/ggml-model-[ftype].gguf
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
arg_idx++;
}
else {

View file

@ -1,4 +1,3 @@
#!/bin/bash
cd `dirname $0`

View file

@ -26,7 +26,6 @@ int main(int argc, char ** argv) {
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_gqa = params.n_gqa;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
@ -45,9 +44,8 @@ int main(int argc, char ** argv) {
llama_free_model(model);
return 1;
}
auto tokens = std::vector<llama_token>(params.n_ctx);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
auto tokens = llama_tokenize(ctx, params.prompt.c_str(), true);
auto n_prompt_tokens = tokens.size();
if (n_prompt_tokens < 1) {
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
llama_free(ctx);
@ -92,7 +90,7 @@ int main(int argc, char ** argv) {
auto next_token_str = llama_token_to_str(ctx, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str);
printf("%s", next_token_str.c_str());
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx);
@ -152,7 +150,7 @@ int main(int argc, char ** argv) {
auto next_token_str = llama_token_to_str(ctx2, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str);
printf("%s", next_token_str.c_str());
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
llama_free(ctx2);

0
examples/server-llama2-13B.sh Normal file → Executable file
View file

View file

@ -5,7 +5,7 @@ This example demonstrates a simple HTTP API server and a simple web front end to
Command line options:
- `--threads N`, `-t N`: Set the number of threads to use during computation.
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@ -48,15 +48,14 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.):
```bash
./server -m models/7B/ggml-model.bin -c 2048
./server -m models/7B/ggml-model.gguf -c 2048
```
### Windows:
```powershell
server.exe -m models\7B\ggml-model.bin -c 2048
server.exe -m models\7B\ggml-model.gguf -c 2048
```
The above command will start a server that by default listens on `127.0.0.1:8080`.
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
@ -127,7 +126,7 @@ node .
`stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`.
`prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does.
`prompt`: Provide a prompt as a string, or as an array of strings and numbers representing tokens. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. If the prompt is a string, or an array with the first element given as a string, a space is inserted in the front like main.cpp does.
`stop`: Specify a JSON array of stopping strings.
These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []).

View file

@ -1,3 +1,4 @@
#!/usr/bin/env python3
import argparse
from flask import Flask, jsonify, request, Response
import urllib.parse

0
examples/server/chat-llama2.sh Normal file → Executable file
View file

0
examples/server/chat.sh Normal file → Executable file
View file

View file

@ -190,6 +190,7 @@ struct llama_server_context
size_t n_past = 0;
size_t n_remain = 0;
json prompt;
std::vector<llama_token> embd;
std::vector<llama_token> last_n_tokens;
@ -267,6 +268,53 @@ struct llama_server_context
return true;
}
std::vector<llama_token> tokenize(json json_prompt, bool add_bos)
{
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
// or the first element of the json_prompt array is a string.
std::vector<llama_token> prompt_tokens;
if (json_prompt.is_array())
{
bool first = true;
for (const auto& p : json_prompt)
{
if (p.is_string())
{
auto s = p.template get<std::string>();
std::vector<llama_token> p;
if (first)
{
s.insert(0, 1, ' '); // add a space if it's the first
p = ::llama_tokenize(ctx, s, add_bos);
first = false;
}
else
{
p = ::llama_tokenize(ctx, s, false);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
}
else
{
if (first)
{
first = false;
}
prompt_tokens.push_back(p.template get<llama_token>());
}
}
}
else
{
auto s = json_prompt.template get<std::string>();
s.insert(0, 1, ' '); // always add a first space
prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
}
return prompt_tokens;
}
bool loadGrammar()
{
if (!params.grammar.empty()) {
@ -279,7 +327,7 @@ struct llama_server_context
grammar_parser::print_grammar(stderr, parsed_grammar);
{
auto it = params.logit_bias.find(llama_token_eos());
auto it = params.logit_bias.find(llama_token_eos(ctx));
if (it != params.logit_bias.end() && it->second == -INFINITY) {
LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {});
}
@ -294,8 +342,8 @@ struct llama_server_context
void loadPrompt()
{
params.prompt.insert(0, 1, ' '); // always add a first space
std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
auto prompt_tokens = tokenize(prompt, true); // always add BOS
num_prompt_tokens = prompt_tokens.size();
if (params.n_keep < 0)
@ -402,7 +450,7 @@ struct llama_server_context
if (params.n_predict == 0)
{
has_next_token = false;
result.tok = llama_token_eos();
result.tok = llama_token_eos(ctx);
return result;
}
@ -442,7 +490,7 @@ struct llama_server_context
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Apply penalties
float nl_logit = logits[llama_token_nl()];
float nl_logit = logits[llama_token_nl(ctx)];
auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
llama_sample_repetition_penalty(ctx, &candidates_p,
last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
@ -452,7 +500,7 @@ struct llama_server_context
last_n_repeat, alpha_frequency, alpha_presence);
if (!penalize_nl)
{
logits[llama_token_nl()] = nl_logit;
logits[llama_token_nl(ctx)] = nl_logit;
}
if (grammar != nullptr) {
@ -515,7 +563,7 @@ struct llama_server_context
// decrement remaining sampling budget
--n_remain;
if (!embd.empty() && embd.back() == llama_token_eos())
if (!embd.empty() && embd.back() == llama_token_eos(ctx))
{
// stopping_word = llama_token_to_str(ctx, embd.back());
has_next_token = false;
@ -652,8 +700,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
fprintf(stdout, " -eps N, --rms-norm-eps N rms norm eps (TEMP!!! use 1e-5 for LLaMAv2) (default: %.1e)\n", params.rms_norm_eps);
fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base);
fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale);
fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
@ -673,12 +719,11 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stdout, " number of layers to store in VRAM\n");
fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
#endif
fprintf(stdout, " -m FNAME, --model FNAME\n");
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@ -774,23 +819,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
params.n_ctx = std::stoi(argv[i]);
}
else if (arg == "-gqa" || arg == "--gqa")
{
if (++i >= argc)
{
invalid_param = true;
break;
}
params.n_gqa = std::stoi(argv[i]);
}
else if (arg == "-eps" || arg == "--rms-norm-eps") {
if (++i >= argc)
{
invalid_param = true;
break;
}
params.rms_norm_eps = std::stof(argv[i]);
}
else if (arg == "--rope-freq-base")
{
if (++i >= argc)
@ -886,12 +914,12 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
#endif // GGML_USE_CUBLAS
}
else if (arg == "--mul-mat-q" || arg == "-mmq")
else if (arg == "--no-mul-mat-q" || arg == "-nommq")
{
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = true;
params.mul_mat_q = false;
#else
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
#endif // GGML_USE_CUBLAS
}
else if (arg == "--main-gpu" || arg == "-mg")
@ -968,7 +996,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
static json format_generation_settings(llama_server_context &llama)
{
const auto eos_bias = llama.params.logit_bias.find(llama_token_eos());
const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx));
const bool ignore_eos = eos_bias != llama.params.logit_bias.end() &&
eos_bias->second < 0.0f && std::isinf(eos_bias->second);
@ -1036,7 +1064,7 @@ static json format_final_response(llama_server_context &llama, const std::string
{"tokens_predicted", llama.num_tokens_predicted},
{"tokens_evaluated", llama.num_prompt_tokens},
{"generation_settings", format_generation_settings(llama)},
{"prompt", llama.params.prompt},
{"prompt", llama.prompt},
{"truncated", llama.truncated},
{"stopped_eos", llama.stopped_eos},
{"stopped_word", llama.stopped_word},
@ -1075,35 +1103,52 @@ static json format_tokenizer_response(const std::vector<llama_token> &tokens)
{"tokens", tokens}};
}
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value)
{
// Fallback null to default value
return body.contains(key) && !body.at(key).is_null()
? body.value(key, default_value)
: default_value;
}
static void parse_options_completion(const json &body, llama_server_context &llama)
{
gpt_params default_params;
llama.stream = body.value("stream", false);
llama.params.n_predict = body.value("n_predict", default_params.n_predict);
llama.params.top_k = body.value("top_k", default_params.top_k);
llama.params.top_p = body.value("top_p", default_params.top_p);
llama.params.tfs_z = body.value("tfs_z", default_params.tfs_z);
llama.params.typical_p = body.value("typical_p", default_params.typical_p);
llama.params.repeat_last_n = body.value("repeat_last_n", default_params.repeat_last_n);
llama.params.temp = body.value("temperature", default_params.temp);
llama.params.repeat_penalty = body.value("repeat_penalty", default_params.repeat_penalty);
llama.params.presence_penalty = body.value("presence_penalty", default_params.presence_penalty);
llama.params.frequency_penalty = body.value("frequency_penalty", default_params.frequency_penalty);
llama.params.mirostat = body.value("mirostat", default_params.mirostat);
llama.params.mirostat_tau = body.value("mirostat_tau", default_params.mirostat_tau);
llama.params.mirostat_eta = body.value("mirostat_eta", default_params.mirostat_eta);
llama.params.penalize_nl = body.value("penalize_nl", default_params.penalize_nl);
llama.params.n_keep = body.value("n_keep", default_params.n_keep);
llama.params.seed = body.value("seed", default_params.seed);
llama.params.prompt = body.value("prompt", default_params.prompt);
llama.params.grammar = body.value("grammar", default_params.grammar);
llama.params.n_probs = body.value("n_probs", default_params.n_probs);
llama.stream = json_value(body, "stream", false);
llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict);
llama.params.top_k = json_value(body, "top_k", default_params.top_k);
llama.params.top_p = json_value(body, "top_p", default_params.top_p);
llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z);
llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p);
llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n);
llama.params.temp = json_value(body, "temperature", default_params.temp);
llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty);
llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty);
llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty);
llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat);
llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau);
llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta);
llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl);
llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep);
llama.params.seed = json_value(body, "seed", default_params.seed);
llama.params.grammar = json_value(body, "grammar", default_params.grammar);
llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs);
if (body.count("prompt") != 0)
{
llama.prompt = body["prompt"];
}
else
{
llama.prompt = "";
}
llama.params.logit_bias.clear();
if (body.value("ignore_eos", false))
if (json_value(body, "ignore_eos", false))
{
llama.params.logit_bias[llama_token_eos()] = -INFINITY;
llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY;
}
const auto &logit_bias = body.find("logit_bias");
@ -1356,8 +1401,11 @@ int main(int argc, char **argv)
auto lock = llama.lock();
const json body = json::parse(req.body);
const std::string content = body.value("content", "");
const std::vector<llama_token> tokens = llama_tokenize(llama.ctx, content, false);
std::vector<llama_token> tokens;
if (body.count("content") != 0)
{
tokens = llama.tokenize(body["content"], false);
}
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json"); });
@ -1369,7 +1417,14 @@ int main(int argc, char **argv)
llama.rewind();
llama_reset_timings(llama.ctx);
llama.params.prompt = body.value("content", "");
if (body.count("content") != 0)
{
llama.prompt = body["content"];
}
else
{
llama.prompt = "";
}
llama.params.n_predict = 0;
llama.loadPrompt();
llama.beginCompletion();
@ -1398,7 +1453,7 @@ int main(int argc, char **argv)
{
if (res.status == 400) {
res.set_content("Invalid request", "text/plain");
} else {
} else if (res.status != 500) {
res.set_content("File Not Found", "text/plain");
res.status = 404;
} });

View file

@ -2,180 +2,129 @@
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include "common.h"
#include "llama.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
int main(int argc, char ** argv)
{
int main(int argc, char ** argv) {
gpt_params params;
//---------------------------------
// Print help :
//---------------------------------
if ( argc == 1 || argv[1][0] == '-' )
{
printf( "usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT]\n" , argv[0]);
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
if ( argc >= 2 )
{
if (argc >= 2) {
params.model = argv[1];
}
if ( argc >= 3 )
{
if (argc >= 3) {
params.prompt = argv[2];
}
if ( params.prompt.empty() )
{
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
//---------------------------------
// Init LLM :
//---------------------------------
// init LLM
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
llama_context_params ctx_params = llama_context_default_params();
std::tie(model, ctx) = llama_init_from_gpt_params( params );
llama_model * model = llama_load_model_from_file(params.model.c_str(), ctx_params);
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
// tokenize the prompt
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize( ctx , params.prompt , true );
tokens_list = ::llama_tokenize(ctx, params.prompt, true);
const int max_context_size = llama_n_ctx( ctx );
const int max_tokens_list_size = max_context_size - 4 ;
const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
if ( (int)tokens_list.size() > max_tokens_list_size )
{
fprintf( stderr , "%s: error: prompt too long (%d tokens, max %d)\n" ,
__func__ , (int)tokens_list.size() , max_tokens_list_size );
if ((int) tokens_list.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) tokens_list.size(), max_tokens_list_size);
return 1;
}
fprintf( stderr, "\n\n" );
fprintf(stderr, "\n\n");
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
printf( "%s" , llama_token_to_str( ctx , id ) );
for (auto id : tokens_list) {
fprintf(stderr, "%s", llama_token_to_str(ctx, id).c_str());
}
fflush(stdout);
fflush(stderr);
//---------------------------------
// Main prediction loop :
//---------------------------------
// main loop
// The LLM keeps a contextual cache memory of previous token evaluation.
// Usually, once this cache is full, it is required to recompute a compressed context based on previous
// tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
// example, we will just stop the loop once this cache is full or once an end of stream is detected.
while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
{
//---------------------------------
// Evaluate the tokens :
//---------------------------------
const int n_gen = std::min(32, max_context_size);
if ( llama_eval( ctx , tokens_list.data() , int(tokens_list.size()) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
{
fprintf( stderr, "%s : failed to eval\n" , __func__ );
while (llama_get_kv_cache_token_count(ctx) < n_gen) {
// evaluate the transformer
if (llama_eval(ctx, tokens_list.data(), int(tokens_list.size()), llama_get_kv_cache_token_count(ctx), params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
tokens_list.clear();
//---------------------------------
// Select the best prediction :
//---------------------------------
// sample the next token
llama_token new_token_id = 0;
auto logits = llama_get_logits( ctx );
auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
std::vector<llama_token_data> candidates;
candidates.reserve( n_vocab );
candidates.reserve(n_vocab);
for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
{
candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// Select it using the "Greedy sampling" method :
new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
new_token_id = llama_sample_token_greedy(ctx , &candidates_p);
// is it an end of stream ?
if ( new_token_id == llama_token_eos() )
{
if (new_token_id == llama_token_eos(ctx)) {
fprintf(stderr, " [end of text]\n");
break;
}
// Print the new token :
printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
fflush( stdout );
// print the new token :
printf("%s", llama_token_to_str(ctx, new_token_id).c_str());
fflush(stdout);
// Push this new token for next evaluation :
tokens_list.push_back( new_token_id );
// push this new token for next evaluation
tokens_list.push_back(new_token_id);
}
} // wend of main loop
llama_free( ctx );
llama_free_model( model );
llama_free(ctx);
llama_free_model(model);
llama_backend_free();
fprintf(stderr, "\n\n");
return 0;
}
// EOF

View file

@ -1,4 +1,5 @@
#include "ggml.h"
#include "common.h"
#include "llama.h"
#include <unordered_map>
#include <vector>
@ -16,7 +17,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
static const float rms_norm_eps = 1e-5f;
struct random_normal_distribution {
std::mt19937 gen;
@ -169,14 +170,16 @@ struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struc
struct llama_vocab {
using id = int32_t;
using token = std::string;
using ttype = llama_token_type;
struct token_score {
token tok;
struct token_data {
token text;
float score;
ttype type;
};
std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token;
std::vector<token_data> id_to_token;
};
struct my_llama_hparams {
@ -1865,10 +1868,10 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
t12->grad = expand(gb, ggml_permute(ctx0, t15->grad, 0, 2, 3, 1)); assert_shape_4d(t12->grad, N, n_batch, n_embd/n_head, n_head);
t11->grad = expand(gb, ggml_reshape_2d(ctx0, ggml_cont(ctx0, t12->grad), N*n_batch, n_embd)); assert_shape_2d(t11->grad, N*n_batch, n_embd);
t10->grad = expand(gb, ggml_permute(ctx0, t14->grad, 0, 2, 1, 3)); assert_shape_4d(t10->grad, n_embd/n_head, n_head, N, n_batch);
t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
t09->grad = expand(gb, ggml_rope_back(ctx0, t10->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false)); assert_shape_4d(t09->grad, n_embd/n_head, n_head, N, n_batch);
t08->grad = expand(gb, ggml_reshape_2d(ctx0, t09->grad, n_embd, N*n_batch)); assert_shape_2d(t08->grad, n_embd, N*n_batch);
t07->grad = expand(gb, ggml_permute(ctx0, t13->grad, 0, 2, 1, 3)); assert_shape_4d(t07->grad, n_embd/n_head, n_head, N, n_batch);
t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
t06->grad = expand(gb, ggml_rope_back(ctx0, t07->grad, n_past, n_rot, rope_mode, n_ctx, 10000.0f, 1.0f, 0.0f, false)); assert_shape_4d(t06->grad, n_embd/n_head, n_head, N, n_batch);
t05->grad = expand(gb, ggml_reshape_2d(ctx0, t06->grad, n_embd, N*n_batch)); assert_shape_2d(t05->grad, n_embd, N*n_batch);
t04->grad = expand(gb, ggml_add_inplace(ctx0,
ggml_add_inplace(ctx0,
@ -1961,7 +1964,7 @@ void print_matrix(struct ggml_tensor * probs) {
void print_token(struct llama_context * ctx, llama_token token) {
printf("%s", llama_token_to_str(ctx, token));
printf("%s", llama_token_to_str(ctx, token).c_str());
}
void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) {
@ -1995,7 +1998,7 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
}
}
void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
int n_tokens = tokens_input->ne[0];
int n_vocab = target_logits->ne[0];
@ -2004,7 +2007,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
ggml_set_f32(target_logits, -1.0f/n_vocab);
ggml_set_f32(target_probs, 0.0f);
ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
ggml_set_i32_1d(tokens_input, 0, llama_token_bos(lctx));
for (int i=1; i<n_tokens+1; ++i) {
int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
set_f32_2d(target_logits, token, i-1, +1.0f);
@ -2015,7 +2018,7 @@ void get_example_targets(const int * train_samples, size_t n_train_samples, cons
}
}
void get_example_targets_batch(struct llama_context * /*lctx*/, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
GGML_ASSERT(tokens_input->n_dims == 2);
GGML_ASSERT(target_logits->n_dims == 3);
GGML_ASSERT(target_probs->n_dims == 3);
@ -2035,7 +2038,7 @@ void get_example_targets_batch(struct llama_context * /*lctx*/, const int * trai
size_t sample = train_samples[(example_id*n_batch + k) % n_train_samples];
GGML_ASSERT(sample+n_tokens-1 < n_train_data);
set_i32_2d(tokens_input, 0, k, llama_token_bos());
set_i32_2d(tokens_input, 0, k, llama_token_bos(lctx));
for (int i=1; i<n_tokens+1; ++i) {
int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
// print_token(lctx, token);
@ -2188,11 +2191,10 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
f.read_raw(buf.data(), f.size);
buf[f.size] = '\0';
out.resize(buf.size());
int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
if (n_tokens >= 0) {
out.resize(n_tokens);
int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
if (n_tokens < 0) {
out.resize(-n_tokens);
llama_tokenize(lctx, buf.data(), out.data(), out.size(), false);
}
bool verify = false;
@ -2200,17 +2202,17 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
const char * in = buf.data();
const char * end = buf.data() + buf.size();
for (int i = 0; i < (int) out.size(); ++i) {
const char * s = llama_token_to_str(lctx, out[i]);
int len = strlen(s);
std::string s = llama_token_to_str(lctx, out[i]);
int len = s.length();
if (in >= end) {
printf("%s: unexpected end of original text.\n", __func__);
break;
}
const bool matches = (strncmp(in, s, len) == 0);
const bool matches = (strncmp(in, s.c_str(), len) == 0);
if (matches) {
in += len;
} else {
printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s);
printf("%s: mismatch: expected '%s', but got '%s'\n", __func__, std::string(in, len).c_str(), s.c_str());
}
}
}
@ -2294,7 +2296,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
const auto params = sampler->params;
// Apply penalties
const float nl_logit = logits[llama_token_nl()];
const float nl_logit = logits[llama_token_nl(ctx)];
const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
@ -2313,7 +2315,7 @@ llama_token sample(struct my_llama_sampler * sampler, float * logits, const llam
params.alpha_presence);
if (!params.penalize_nl) {
logits[llama_token_nl()] = nl_logit;
logits[llama_token_nl(ctx)] = nl_logit;
}
llama_token token = 0;
@ -2612,42 +2614,45 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
return;
}
// write_magic
file.write_u32(LLAMA_FILE_MAGIC); // magic
file.write_u32(LLAMA_FILE_VERSION); // version
// write_hparams
file.write_u32(model->hparams.n_vocab);
file.write_u32(model->hparams.n_embd);
file.write_u32(model->hparams.n_mult);
file.write_u32(model->hparams.n_head);
file.write_u32(model->hparams.n_layer);
file.write_u32(model->hparams.n_rot);
file.write_u32(LLAMA_FTYPE_ALL_F32);
// write_vocab
uint32_t n_vocab = model->hparams.n_vocab;
for (uint32_t i = 0; i < n_vocab; i++) {
const auto & token_score = vocab->id_to_token.at(i);
file.write_u32((uint32_t) token_score.tok.size());
file.write_raw(token_score.tok.data(), token_score.tok.size());
file.write_raw(&token_score.score, sizeof(token_score.score));
}
// write tensors
write_tensor(&file, model->tok_embeddings);
write_tensor(&file, model->norm);
write_tensor(&file, model->output);
for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
auto & layer = model->layers[i];
write_tensor(&file, layer.attention_norm);
write_tensor(&file, layer.wq);
write_tensor(&file, layer.wk);
write_tensor(&file, layer.wv);
write_tensor(&file, layer.wo);
write_tensor(&file, layer.ffn_norm);
write_tensor(&file, layer.w1);
write_tensor(&file, layer.w2);
write_tensor(&file, layer.w3);
}
#pragma message("TODO: implement file saving using gguf")
(void) vocab;
(void) model;
// // write_magic
// file.write_u32(LLAMA_FILE_MAGIC); // magic
// file.write_u32(LLAMA_FILE_VERSION); // version
// // write_hparams
// file.write_u32(model->hparams.n_vocab);
// file.write_u32(model->hparams.n_embd);
// file.write_u32(model->hparams.n_mult);
// file.write_u32(model->hparams.n_head);
// file.write_u32(model->hparams.n_layer);
// file.write_u32(model->hparams.n_rot);
// file.write_u32(LLAMA_FTYPE_ALL_F32);
// // write_vocab
// uint32_t n_vocab = model->hparams.n_vocab;
// for (uint32_t i = 0; i < n_vocab; i++) {
// const auto & token_data = vocab->id_to_token.at(i);
// file.write_u32((uint32_t) token_data.tok.size());
// file.write_raw(token_data.tok.data(), token_data.tok.size());
// file.write_raw(&token_data.score, sizeof(token_data.score));
// }
// // write tensors
// write_tensor(&file, model->tok_embeddings);
// write_tensor(&file, model->norm);
// write_tensor(&file, model->output);
// for (uint32_t i = 0; i < model->hparams.n_layer; ++i) {
// auto & layer = model->layers[i];
//
// write_tensor(&file, layer.attention_norm);
// write_tensor(&file, layer.wq);
// write_tensor(&file, layer.wk);
// write_tensor(&file, layer.wv);
// write_tensor(&file, layer.wo);
// write_tensor(&file, layer.ffn_norm);
// write_tensor(&file, layer.w1);
// write_tensor(&file, layer.w2);
// write_tensor(&file, layer.w3);
// }
}
float cosine_decay(const int decay_steps, const float alpha, int step) {
@ -3052,20 +3057,13 @@ int main(int argc, char ** argv) {
struct llama_vocab vocab;
{
std::vector<const char *> strings;
std::vector<float> scores;
int n_vocab = llama_n_vocab(lctx);
strings.resize(n_vocab, NULL);
scores.resize(n_vocab, 0);
n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
const int n_vocab = llama_n_vocab(lctx);
vocab.id_to_token.resize(n_vocab);
for (int i=0; i<n_vocab; ++i) {
std::string tok = std::string(strings[i]);
float score = scores[i];
vocab.id_to_token[i].tok = tok;
vocab.id_to_token[i].score = score;
vocab.token_to_id.emplace(tok, i);
vocab.id_to_token[i].text = llama_token_get_text(lctx, i);
vocab.id_to_token[i].score = llama_token_get_score(lctx, i);
vocab.id_to_token[i].type = llama_token_get_type(lctx, i);
vocab.token_to_id.emplace(vocab.id_to_token[i].text, i);
}
}
@ -3178,7 +3176,7 @@ int main(int argc, char ** argv) {
std::vector<int> train_samples;
train_samples.push_back(0);
for (int i = 1; i < (int) train_tokens.size() - n_tokens; ++i) {
if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl())) {
if (!params.samples_start_after_nl || (train_tokens[i-1] == llama_token_nl(lctx))) {
train_samples.push_back(i);
}
}
@ -3338,7 +3336,7 @@ int main(int argc, char ** argv) {
struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs);
for (int i=sample_ctx; i<n_tokens; ++i) {
ggml_set_i32_1d(tokens_input, i, n_vocab/2);
}