Import llama.cpp

https://github.com/ggerganov/llama.cpp
0b2da20538d01926b77ea237dd1c930c4d20b686
See third_party/ggml/README.cosmo for changes
This commit is contained in:
Justine Tunney 2023-04-27 14:31:20 -07:00
parent f42089d5c6
commit e8b43903b2
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
14 changed files with 18313 additions and 2 deletions

View file

@ -144,6 +144,7 @@ include libc/stdio/stdio.mk # │
include third_party/libcxx/libcxx.mk # │
include net/net.mk # │
include third_party/vqsort/vqsort.mk # │
include third_party/ggml/ggml.mk # │
include libc/log/log.mk # │
include third_party/bzip2/bzip2.mk # │
include dsp/core/core.mk # │

21
third_party/ggml/LICENSE vendored Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

21
third_party/ggml/README.cosmo vendored Normal file
View file

@ -0,0 +1,21 @@
DESCRIPTION
ggml is a machine learning library useful for LLM inference on CPUs
LICENSE
MIT
ORIGIN
https://github.com/ggerganov/llama.cpp
commit 0b2da20538d01926b77ea237dd1c930c4d20b686
Author: Stephan Walter <stephan@walter.name>
Date: Wed Apr 26 20:26:42 2023 +0000
ggml : slightly faster AVX2 implementation for Q5 (#1197)
LOCAL CHANGES
- Refactor headers per cosmo convention
- Replace code like 'ggjt' with READ32BE("ggjt")
- Remove C++ exceptions; use Die() function instead

385
third_party/ggml/common.cc vendored Normal file
View file

@ -0,0 +1,385 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
llama.cpp
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
asm(".ident\t\"\\n\\n\
llama.cpp (MIT License)\\n\
Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
#include "third_party/ggml/common.h"
#include "third_party/libcxx/cassert"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/fstream"
#include "third_party/libcxx/string"
#include "third_party/libcxx/iterator"
#include "third_party/libcxx/algorithm"
#if defined (_WIN32)
#include "libc/calls/calls.h"
#include "libc/calls/struct/flock.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/at.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fd.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/posix.h"
#include "libc/sysv/consts/s.h"
// MISSING #include <io.h>
#pragma comment(lib,"kernel32.lib")
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
const wchar_t * lpWideCharStr, int cchWideChar,
char * lpMultiByteStr, int cbMultiByte,
const char * lpDefaultChar, bool * lpUsedDefaultChar);
#define CP_UTF8 65001
#endif
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
// determine sensible default number of threads.
// std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
#ifdef __linux__
std::ifstream cpuinfo("/proc/cpuinfo");
params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
std::istream_iterator<std::string>(),
std::string("processor"));
#endif
if (params.n_threads == 0) {
params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
}
bool invalid_param = false;
std::string arg;
gpt_params default_params;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-s" || arg == "--seed") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.seed = std::stoi(argv[i]);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
} else if (arg == "-p" || arg == "--prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.prompt = argv[i];
} else if (arg == "-f" || arg == "--file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
} else if (arg == "-n" || arg == "--n_predict") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
} else if (arg == "--top_k") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.top_k = std::stoi(argv[i]);
} else if (arg == "-c" || arg == "--ctx_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
} else if (arg == "--memory_f32") {
params.memory_f16 = false;
} else if (arg == "--top_p") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.top_p = std::stof(argv[i]);
} else if (arg == "--temp") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.temp = std::stof(argv[i]);
} else if (arg == "--repeat_last_n") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.repeat_last_n = std::stoi(argv[i]);
} else if (arg == "--repeat_penalty") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.repeat_penalty = std::stof(argv[i]);
} else if (arg == "-b" || arg == "--batch_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
} else if (arg == "--keep") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_keep = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model = argv[i];
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter = argv[i];
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
} else if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
} else if (arg == "--embedding") {
params.embedding = true;
} else if (arg == "--interactive-first") {
params.interactive_first = true;
} else if (arg == "-ins" || arg == "--instruct") {
params.instruct = true;
} else if (arg == "--color") {
params.use_color = true;
} else if (arg == "--mlock") {
params.use_mlock = true;
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.antiprompt.push_back(argv[i]);
} else if (arg == "--perplexity") {
params.perplexity = true;
} else if (arg == "--ignore-eos") {
params.ignore_eos = true;
} else if (arg == "--n_parts") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_parts = std::stoi(argv[i]);
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, default_params);
exit(0);
} else if (arg == "--random-prompt") {
params.random_prompt = true;
} else if (arg == "--in-prefix") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.input_prefix = argv[i];
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
}
if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, default_params);
exit(1);
}
return true;
}
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -i, --interactive run in interactive mode\n");
fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
fprintf(stderr, " specified more than once for multiple prompts).\n");
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: empty)\n");
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " prompt file to start generation.\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", (double)params.top_p);
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
if (llama_mlock_supported()) {
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
}
if (llama_mmap_supported()) {
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
}
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}
std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
case 0: return "So";
case 1: return "Once upon a time";
case 2: return "When";
case 3: return "The";
case 4: return "After";
case 5: return "If";
case 6: return "import";
case 7: return "He";
case 8: return "She";
case 9: return "They";
default: return "To";
}
return "The";
}
// TODO: not great allocating this every time
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int)add_bos);
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
return res;
}
/* Keep track of current color of output, and emit ANSI code if it changes. */
void set_console_color(console_state & con_st, console_color_t color) {
if (con_st.use_color && con_st.color != color) {
switch(color) {
case CONSOLE_COLOR_DEFAULT:
printf(ANSI_COLOR_RESET);
break;
case CONSOLE_COLOR_PROMPT:
printf(ANSI_COLOR_YELLOW);
break;
case CONSOLE_COLOR_USER_INPUT:
printf(ANSI_BOLD ANSI_COLOR_GREEN);
break;
}
con_st.color = color;
}
}
#if defined (_WIN32)
void win32_console_init(bool enable_color) {
unsigned long dwMode = 0;
void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
hConOut = 0;
}
}
if (hConOut) {
// Enable ANSI colors on Windows 10+
if (enable_color && !(dwMode & 0x4)) {
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
}
// Set console output codepage to UTF8
SetConsoleOutputCP(CP_UTF8);
}
void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
// Set console input codepage to UTF16
_setmode(_fileno(stdin), _O_WTEXT);
}
}
// Convert a wide Unicode string to an UTF8 string
void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
std::string strTo(size_needed, 0);
WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
str = strTo;
}
#endif

103
third_party/ggml/common.h vendored Normal file
View file

@ -0,0 +1,103 @@
// -*- c++ -*-
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
#include "third_party/ggml/llama.h"
#include "third_party/libcxx/string"
#include "third_party/libcxx/vector"
#include "third_party/libcxx/random"
#include "third_party/libcxx/thread"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
// clang-format off
// Various helper functions and utilities
//
// CLI argument parsing
//
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 128; // new tokens to predict
int32_t repeat_last_n = 64; // last n tokens to penalize
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; // context size
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
// sampling parameters
int32_t top_k = 40;
float top_p = 0.95f;
float temp = 0.80f;
float repeat_penalty = 1.10f;
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = "";
std::string input_prefix = ""; // string to prefix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string lora_adapter = ""; // lora adapter path
std::string lora_base = ""; // base model path for the lora adapter
bool memory_f16 = true; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool embedding = false; // get only sentence embedding
bool interactive_first = false; // wait for user input immediately
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool verbose_prompt = false; // print prompt tokens before generation
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Vocab utils
//
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
//
// Console utils
//
#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_RESET "\x1b[0m"
#define ANSI_BOLD "\x1b[1m"
enum console_color_t {
CONSOLE_COLOR_DEFAULT=0,
CONSOLE_COLOR_PROMPT,
CONSOLE_COLOR_USER_INPUT
};
struct console_state {
bool use_color = false;
console_color_t color = CONSOLE_COLOR_DEFAULT;
};
void set_console_color(console_state & con_st, console_color_t color);
#if defined (_WIN32)
void win32_console_init(bool enable_color);
void win32_utf8_encode(const std::wstring & wstr, std::string & str);
#endif
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_ */

13137
third_party/ggml/ggml.c vendored Normal file

File diff suppressed because it is too large Load diff

889
third_party/ggml/ggml.h vendored Normal file
View file

@ -0,0 +1,889 @@
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
#define COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
// clang-format off
//
// GGML Tensor Library
//
// This documentation is still a work in progress.
// If you wish some specific topics to be covered, feel free to drop a comment:
//
// https://github.com/ggerganov/whisper.cpp/issues/40
//
// ## Overview
//
// This library implements:
//
// - a set of tensor operations
// - automatic differentiation
// - basic optimization algorithms
//
// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
// but is not limited to, the following:
//
// - linear regression
// - support vector machines
// - neural networks
//
// The library allows the user to define a certain function using the available tensor operations. This function
// definition is represented internally via a computation graph. Each tensor operation in the function definition
// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
// using one of the available optimization algorithms.
//
// For example, here we define the function: f(x) = a*x^2 + b
//
// {
// struct ggml_init_params params = {
// .mem_size = 16*1024*1024,
// .mem_buffer = NULL,
// };
//
// // memory allocation happens here
// struct ggml_context * ctx = ggml_init(params);
//
// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
//
// ggml_set_param(ctx, x); // x is an input variable
//
// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
//
// ...
// }
//
// Notice that the function definition above does not involve any actual computation. The computation is performed only
// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
//
// {
// ...
//
// struct ggml_cgraph gf = ggml_build_forward(f);
//
// // set the input variable and parameter values
// ggml_set_f32(x, 2.0f);
// ggml_set_f32(a, 3.0f);
// ggml_set_f32(b, 4.0f);
//
// ggml_graph_compute(ctx0, &gf);
//
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
//
// ...
// }
//
// The actual computation is performed in the ggml_graph_compute() function.
//
// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
// actually needed.
//
// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
// differentiation and optimization algorithms.
//
// The described approach allows to define the function graph once and then compute its forward or backward graphs
// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
// the user can avoid the memory allocation overhead at runtime.
//
// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
// citizens, but in theory the library can be extended to support FP8 and integer data types.
//
// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
// clear that the library needs to support more complex operations. The way to support these operations is not clear
// yet, but a few examples are demonstrated in the following operations:
//
// - ggml_permute()
// - ggml_conv_1d_1s()
// - ggml_conv_1d_2s()
//
// For each tensor operator, the library implements a forward and backward computation function. The forward function
// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
// calculus class, or watch the following video:
//
// What is Automatic Differentiation?
// https://www.youtube.com/watch?v=wG_nF1awSSY
//
//
// ## Tensor data (struct ggml_tensor)
//
// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
//
// {
// struct ggml_tensor * c = ggml_add(ctx, a, b);
//
// assert(c->src[0] == a);
// assert(c->src[1] == b);
// }
//
// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
// contiguous in memory.
//
// The data of the tensor is accessed via the "data" pointer. For example:
//
// {
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
//
// // a[1, 2] = 1.0f;
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
//
// // a[2, 0] = 2.0f;
// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
//
// ...
// }
//
// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
//
// ## The matrix multiplication operator (ggml_mul_mat)
//
// TODO
//
//
// ## Multi-threading
//
// TODO
//
//
// ## Overview of ggml.c
//
// TODO
//
//
// ## SIMD optimizations
//
// TODO
//
//
// ## Debugging ggml
//
// TODO
//
//
#ifdef GGML_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BUILD
# define GGML_API __declspec(dllexport)
# else
# define GGML_API __declspec(dllimport)
# endif
# else
# define GGML_API __attribute__ ((visibility ("default")))
# endif
#else
# define GGML_API
#endif
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 16
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_DEFAULT_N_THREADS 4
#ifdef __ARM_NEON
// we use the built-in 16-bit float type
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;
#endif
// convert FP16 <-> FP32
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
struct ggml_object;
struct ggml_context;
enum ggml_type {
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
GGML_TYPE_Q4_2 = 4,
GGML_TYPE_Q4_3 = 5,
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_COUNT,
};
// available tensor operations:
enum ggml_op {
GGML_OP_NONE = 0,
GGML_OP_DUP,
GGML_OP_ADD,
GGML_OP_SUB,
GGML_OP_MUL,
GGML_OP_DIV,
GGML_OP_SQR,
GGML_OP_SQRT,
GGML_OP_SUM,
GGML_OP_MEAN,
GGML_OP_REPEAT,
GGML_OP_ABS,
GGML_OP_SGN,
GGML_OP_NEG,
GGML_OP_STEP,
GGML_OP_RELU,
GGML_OP_GELU,
GGML_OP_SILU,
GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM,
GGML_OP_MUL_MAT,
GGML_OP_SCALE,
GGML_OP_CPY,
GGML_OP_CONT,
GGML_OP_RESHAPE,
GGML_OP_VIEW,
GGML_OP_PERMUTE,
GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS,
GGML_OP_DIAG_MASK_INF,
GGML_OP_SOFT_MAX,
GGML_OP_ROPE,
GGML_OP_CONV_1D_1S,
GGML_OP_CONV_1D_2S,
GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF,
GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY,
GGML_OP_COUNT,
};
// ggml object
struct ggml_object {
size_t offs;
size_t size;
struct ggml_object * next;
char padding[8];
};
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
int n_dims;
int64_t ne[GGML_MAX_DIMS]; // number of elements
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
// nb[0] = sizeof(type)
// nb[1] = nb[0] * ne[0] + padding
// nb[i] = nb[i-1] * ne[i-1]
// compute data
enum ggml_op op;
bool is_param;
struct ggml_tensor * grad;
struct ggml_tensor * src0;
struct ggml_tensor * src1;
struct ggml_tensor * opt[GGML_MAX_OPT];
// thread scheduling
int n_tasks;
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
void * data;
char padding[8];
};
// computation graph
struct ggml_cgraph {
int n_nodes;
int n_leafs;
int n_threads;
size_t work_size;
struct ggml_tensor * work;
struct ggml_tensor * nodes[GGML_MAX_NODES];
struct ggml_tensor * grads[GGML_MAX_NODES];
struct ggml_tensor * leafs[GGML_MAX_NODES];
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
};
// scratch buffer
struct ggml_scratch {
size_t offs;
size_t size;
void * data;
};
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
// misc
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
GGML_API int64_t ggml_time_ms(void);
GGML_API int64_t ggml_time_us(void);
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);
GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API int ggml_blck_size (enum ggml_type type);
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_quantized(enum ggml_type type);
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
int n_dims,
const int64_t *ne);
GGML_API struct ggml_tensor * ggml_new_tensor_1d(
struct ggml_context * ctx,
enum ggml_type type,
int64_t ne0);
GGML_API struct ggml_tensor * ggml_new_tensor_2d(
struct ggml_context * ctx,
enum ggml_type type,
int64_t ne0,
int64_t ne1);
GGML_API struct ggml_tensor * ggml_new_tensor_3d(
struct ggml_context * ctx,
enum ggml_type type,
int64_t ne0,
int64_t ne1,
int64_t ne2);
GGML_API struct ggml_tensor * ggml_new_tensor_4d(
struct ggml_context * ctx,
enum ggml_type type,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3);
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
//
// operations on tensors with backpropagation
//
GGML_API struct ggml_tensor * ggml_dup(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_add(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_add_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_sub(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_mul(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_div(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_sqr(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sqrt(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return scalar
// TODO: compute sum along rows
GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx,
struct ggml_tensor * a);
// mean along rows
GGML_API struct ggml_tensor * ggml_mean(
struct ggml_context * ctx,
struct ggml_tensor * a);
// if a is the same shape as b, and a is not parameter, return a
// otherwise, return a new tensor: repeat(a) to fit in b
GGML_API struct ggml_tensor * ggml_repeat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_abs(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sgn(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_neg(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_step(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_relu(
struct ggml_context * ctx,
struct ggml_tensor * a);
// TODO: double-check this computation is correct
GGML_API struct ggml_tensor * ggml_gelu(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_silu(
struct ggml_context * ctx,
struct ggml_tensor * a);
// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx,
struct ggml_tensor * a);
// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows
GGML_API struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
//
// operations on tensors without backpropagation
//
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_scale(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// make contiguous
GGML_API struct ggml_tensor * ggml_cont(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view
GGML_API struct ggml_tensor * ggml_reshape(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
GGML_API struct ggml_tensor * ggml_reshape_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1);
// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
GGML_API struct ggml_tensor * ggml_reshape_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2);
// offset in bytes
GGML_API struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
size_t offset);
GGML_API struct ggml_tensor * ggml_view_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
size_t nb1, // row stride in bytes
size_t offset);
GGML_API struct ggml_tensor * ggml_view_3d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
size_t nb1, // row stride in bytes
size_t nb2, // slice stride in bytes
size_t offset);
GGML_API struct ggml_tensor * ggml_permute(
struct ggml_context * ctx,
struct ggml_tensor * a,
int axis0,
int axis1,
int axis2,
int axis3);
// alias for ggml_permute(ctx, a, 1, 0, 2, 3)
GGML_API struct ggml_tensor * ggml_transpose(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// set elements above the diagonal to -INF
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx,
struct ggml_tensor * a);
// rotary position embedding
// in-place, returns view(a)
// if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style
// TODO: avoid creating a new tensor every time
GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode);
// padding = 1
// TODO: we don't support extra parameters for now
// that's why we are hard-coding the stride, padding, and dilation
// not great ..
GGML_API struct ggml_tensor * ggml_conv_1d_1s(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_conv_1d_2s(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_flash_attn(
struct ggml_context * ctx,
struct ggml_tensor * q,
struct ggml_tensor * k,
struct ggml_tensor * v,
bool masked);
GGML_API struct ggml_tensor * ggml_flash_ff(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b0,
struct ggml_tensor * b1,
struct ggml_tensor * c0,
struct ggml_tensor * c1);
// Mapping operations
GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
const ggml_unary_op_f32_t fun);
GGML_API struct ggml_tensor * ggml_map_binary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
const ggml_binary_op_f32_t fun);
//
// automatic differentiation
//
GGML_API void ggml_set_param(
struct ggml_context * ctx,
struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
// dump the graph into a file using the dot format
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
//
// optimization
//
// optimization methods
enum ggml_opt_type {
GGML_OPT_ADAM,
GGML_OPT_LBFGS,
};
// linesearch methods
enum ggml_linesearch {
GGML_LINESEARCH_DEFAULT = 1,
GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
};
// optimization return values
enum ggml_opt_result {
GGML_OPT_OK = 0,
GGML_OPT_DID_NOT_CONVERGE,
GGML_OPT_NO_CONTEXT,
GGML_OPT_INVALID_WOLFE,
GGML_OPT_FAIL,
GGML_LINESEARCH_FAIL = -128,
GGML_LINESEARCH_MINIMUM_STEP,
GGML_LINESEARCH_MAXIMUM_STEP,
GGML_LINESEARCH_MAXIMUM_ITERATIONS,
GGML_LINESEARCH_INVALID_PARAMETERS,
};
// optimization parameters
//
// see ggml.c (ggml_opt_default_params) for default values
//
struct ggml_opt_params {
enum ggml_opt_type type;
int n_threads;
// delta-based convergence test
//
// if past == 0 - disabled
// if past > 0:
// stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
//
int past;
float delta;
// maximum number of iterations without improvement
//
// if 0 - disabled
// if > 0:
// assume convergence if no cost improvement in this number of iterations
//
int max_no_improvement;
bool print_forward_graph;
bool print_backward_graph;
// ADAM parameters
struct {
int n_iter;
float alpha; // learning rate
float beta1;
float beta2;
float eps; // epsilon for numerical stability
float eps_f; // epsilon for convergence test
float eps_g; // epsilon for convergence test
} adam;
// LBFGS parameters
struct {
int m; // number of corrections to approximate the inv. Hessian
int n_iter;
int max_linesearch;
float eps; // convergence tolerance
float ftol; // line search tolerance
float wolfe;
float min_step;
float max_step;
enum ggml_linesearch linesearch;
} lbfgs;
};
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
// optimize the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt(
struct ggml_context * ctx,
struct ggml_opt_params params,
struct ggml_tensor * f);
//
// quantization
//
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
//
// system info
//
GGML_API int ggml_cpu_has_avx (void);
GGML_API int ggml_cpu_has_avx2 (void);
GGML_API int ggml_cpu_has_avx512 (void);
GGML_API int ggml_cpu_has_avx512_vbmi(void);
GGML_API int ggml_cpu_has_avx512_vnni(void);
GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_arm_fma (void);
GGML_API int ggml_cpu_has_f16c (void);
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_vsx (void);
//
// Internal types and functions exposed for tests and benchmarks
//
#ifdef __cplusplus
// restrict not standard in C++
#define GGML_RESTRICT
#else
#define GGML_RESTRICT restrict
#endif
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
typedef struct {
dequantize_row_q_t dequantize_row_q;
quantize_row_q_t quantize_row_q;
quantize_row_q_t quantize_row_q_reference;
quantize_row_q_t quantize_row_q_dot;
vec_dot_q_t vec_dot_q;
enum ggml_type vec_dot_type;
} quantize_fns_t;
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_ */

114
third_party/ggml/ggml.mk vendored Normal file
View file

@ -0,0 +1,114 @@
#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
PKGS += THIRD_PARTY_GGML
################################################################################
# single file machine learning framework written in c
# make -j8 o//third_party/ggml/ggml.a
THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_A
THIRD_PARTY_GGML = $(THIRD_PARTY_GGML_A_DEPS) $(THIRD_PARTY_GGML_A)
THIRD_PARTY_GGML_A = o/$(MODE)/third_party/ggml/ggml.a
THIRD_PARTY_GGML_A_HDRS = third_party/ggml/ggml.h
THIRD_PARTY_GGML_A_SRCS = third_party/ggml/ggml.c
THIRD_PARTY_GGML_A_OBJS = $(THIRD_PARTY_GGML_A_SRCS:%.c=o/$(MODE)/%.o)
THIRD_PARTY_GGML_A_FILES = $(THIRD_PARTY_GGML_A_SRCS) $(THIRD_PARTY_GGML_A_HDRS)
THIRD_PARTY_GGML_A_CHECKS = $(THIRD_PARTY_GGML_A).pkg $(THIRD_PARTY_GGML_A_HDRS:%=o/$(MODE)/%.ok)
THIRD_PARTY_GGML_A_DIRECTDEPS = \
LIBC_CALLS \
LIBC_INTRIN \
LIBC_MEM \
LIBC_NEXGEN32E \
LIBC_RUNTIME \
LIBC_STDIO \
LIBC_THREAD \
LIBC_STR \
LIBC_STUBS \
LIBC_SYSV \
LIBC_TINYMATH
THIRD_PARTY_GGML_A_DEPS := \
$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x))))
$(THIRD_PARTY_GGML_A): \
third_party/ggml/ \
$(THIRD_PARTY_GGML_A).pkg \
$(THIRD_PARTY_GGML_A_OBJS)
$(THIRD_PARTY_GGML_A).pkg: \
$(THIRD_PARTY_GGML_A_OBJS) \
$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x)_A).pkg)
$(THIRD_PARTY_GGML_A_OBJS): private \
OVERRIDE_CFLAGS += \
-O3 \
-ffunction-sections \
-fdata-sections \
-msse3 \
-mavx \
-mavx2 \
-mf16c \
-mfma
################################################################################
# command for running inference on large language models
# make -j8 o//third_party/ggml/llama.com
THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_LLAMA
THIRD_PARTY_GGML_LLAMA = o/$(MODE)/third_party/ggml/llama.com
THIRD_PARTY_GGML_LLAMA_HDRS = third_party/ggml/llama.h third_party/ggml/llama_util.h third_party/ggml/common.h
THIRD_PARTY_GGML_LLAMA_SRCS = third_party/ggml/main.cc third_party/ggml/llama.cc third_party/ggml/common.cc
THIRD_PARTY_GGML_LLAMA_OBJS = $(THIRD_PARTY_GGML_LLAMA_SRCS:%.cc=o/$(MODE)/%.o)
THIRD_PARTY_GGML_LLAMA_FILES := $(THIRD_PARTY_GGML_LLAMA_SRCS) $(THIRD_PARTY_GGML_LLAMA_HDRS)
THIRD_PARTY_GGML_LLAMA_CHECKS = $(THIRD_PARTY_GGML_LLAMA).pkg $(THIRD_PARTY_GGML_LLAMA_HDRS:%=o/$(MODE)/%.okk)
THIRD_PARTY_GGML_LLAMA_DIRECTDEPS = \
LIBC_CALLS \
LIBC_FMT \
LIBC_INTRIN \
LIBC_MEM \
LIBC_NEXGEN32E \
LIBC_RUNTIME \
LIBC_STDIO \
LIBC_STR \
LIBC_STUBS \
LIBC_SYSV \
LIBC_THREAD \
LIBC_TINYMATH \
THIRD_PARTY_GGML \
THIRD_PARTY_LIBCXX
THIRD_PARTY_GGML_LLAMA_DEPS := \
$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x))))
$(THIRD_PARTY_GGML_LLAMA).dbg: \
$(THIRD_PARTY_GGML_LLAMA).pkg \
$(THIRD_PARTY_GGML_LLAMA_DEPS) \
o/$(MODE)/third_party/ggml/common.o \
o/$(MODE)/third_party/ggml/llama.o \
o/$(MODE)/third_party/ggml/main.o \
$(CRT) \
$(APE_NO_MODIFY_SELF)
@$(APELINK)
$(THIRD_PARTY_GGML_LLAMA).pkg: \
$(THIRD_PARTY_GGML_LLAMA_OBJS) \
$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x)_A).pkg)
################################################################################
THIRD_PARTY_GGML_COMS = $(THIRD_PARTY_GGML_LLAMA)
THIRD_PARTY_GGML_BINS = $(THIRD_PARTY_GGML_COMS) $(THIRD_PARTY_GGML_COMS:%=%.dbg)
THIRD_PARTY_GGML_LIBS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)))
THIRD_PARTY_GGML_SRCS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_SRCS))
THIRD_PARTY_GGML_HDRS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_HDRS))
THIRD_PARTY_GGML_OBJS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_OBJS))
THIRD_PARTY_GGML_CHECKS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_CHECKS))
$(THIRD_PARTY_GGML_OBJS): third_party/ggml/ggml.mk
.PHONY: o/$(MODE)/third_party/ggml
o/$(MODE)/third_party/ggml: \
$(THIRD_PARTY_GGML_BINS) \
$(THIRD_PARTY_GGML_CHECKS)

2472
third_party/ggml/llama.cc vendored Normal file

File diff suppressed because it is too large Load diff

211
third_party/ggml/llama.h vendored Normal file
View file

@ -0,0 +1,211 @@
// -*- c++ -*-
// clang-format off
#ifndef LLAMA_H
#define LLAMA_H
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
# define LLAMA_API __declspec(dllimport)
# endif
# else
# define LLAMA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAMA_API
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// TODO: show sample usage
//
struct llama_context;
typedef int llama_token;
typedef struct llama_token_data {
llama_token id; // token id
float p; // probability of the token
float plog; // log probability of the token
} llama_token_data;
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
int n_ctx; // text context
int n_parts; // -1 for default
int seed; // RNG seed, 0 for random
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
void * progress_callback_user_data;
};
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API bool llama_mmap_supported();
LLAMA_API bool llama_mlock_supported();
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
LLAMA_API struct llama_context * llama_init_from_file(
const char * path_model,
struct llama_context_params params);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
// TODO: not great API - very likely to change
// Returns 0 on success
// nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
enum llama_ftype ftype,
int nthread);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one
// Returns 0 on success
LLAMA_API int llama_apply_lora_from_file(
struct llama_context * ctx,
const char * path_lora,
const char * path_base_model,
int n_threads);
// Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
// Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
// Copies the state to the specified destination address.
// Destination needs to have allocated enough memory.
// Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
// Set the state reading from the specified address
// Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
LLAMA_API int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
int n_tokens,
int n_past,
int n_threads);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
LLAMA_API int llama_n_embd (struct llama_context * ctx);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos();
LLAMA_API llama_token llama_token_eos();
// TODO: improve the last_n_tokens interface ?
LLAMA_API llama_token llama_sample_top_p_top_k(
struct llama_context * ctx,
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
float top_p,
float temp,
float repeat_penalty);
// Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
#ifdef __cplusplus
}
#endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
#include "third_party/libcxx/vector"
#include "third_party/libcxx/string"
struct ggml_tensor;
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif
#endif // LLAMA_H

388
third_party/ggml/llama_util.h vendored Executable file
View file

@ -0,0 +1,388 @@
// Internal header to be included only by llama.cpp.
// Contains wrappers around OS interfaces.
#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H
#include "libc/calls/struct/rlimit.h"
#include "libc/fmt/fmt.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/madv.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/prot.h"
#include "libc/sysv/consts/rlimit.h"
#include "third_party/libcxx/cerrno"
#include "third_party/libcxx/climits"
#include "third_party/libcxx/cstdarg"
#include "third_party/libcxx/cstdint"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/cstdlib"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/string"
#include "third_party/libcxx/vector"
// clang-format off
#define LLAMA_ASSERT(x) \
do { \
if (!(x)) { \
fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
abort(); \
} \
} while (0)
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((__format__(__gnu_printf__, 1, 2)))
#else
__attribute__((__format__(__printf__, 1, 2)))
#endif
__attribute__((__noreturn__))
#endif
static void Die(const char *fmt, ...) {
va_list va;
va_start(va, fmt);
vfprintf(stderr, fmt, va);
va_end(va);
fputc('\n', stderr);
exit(1);
}
struct llama_file {
// use FILE * so we don't have to re-open the file to mmap
FILE * fp;
size_t size;
llama_file(const char * fname, const char * mode) {
fp = std::fopen(fname, mode);
if (fp == NULL) {
Die("failed to open %s: %s", fname, std::strerror(errno));
}
seek(0, SEEK_END);
size = tell();
seek(0, SEEK_SET);
}
size_t tell() const {
#ifdef _WIN32
__int64 ret = _ftelli64(fp);
#else
long ret = std::ftell(fp);
#endif
LLAMA_ASSERT(ret != -1); // this really shouldn't fail
return (size_t) ret;
}
void seek(size_t offset, int whence) {
#ifdef _WIN32
int ret = _fseeki64(fp, (__int64) offset, whence);
#else
int ret = std::fseek(fp, (long) offset, whence);
#endif
LLAMA_ASSERT(ret == 0); // same
}
void read_raw(void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
std::size_t ret = std::fread(ptr, size, 1, fp);
if (ferror(fp)) {
Die("read error: %s", strerror(errno));
}
if (ret != 1) {
Die("unexpectedly reached end of file");
}
}
std::uint32_t read_u32() {
std::uint32_t ret;
read_raw(&ret, sizeof(ret));
return ret;
}
std::string read_string(std::uint32_t len) {
std::vector<char> chars(len);
read_raw(chars.data(), len);
return std::string(chars.data(), len);
}
void write_raw(const void * ptr, size_t size) {
if (size == 0) {
return;
}
errno = 0;
size_t ret = std::fwrite(ptr, size, 1, fp);
if (ret != 1) {
Die("write error: %s", strerror(errno));
}
}
void write_u32(std::uint32_t val) {
write_raw(&val, sizeof(val));
}
~llama_file() {
if (fp) {
std::fclose(fp);
}
}
};
#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
LPSTR buf;
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
if (!size) {
return "FormatMessageA failed";
}
std::string ret(buf, size);
LocalFree(buf);
return ret;
}
#endif
struct llama_mmap {
void * addr;
size_t size;
llama_mmap(const llama_mmap &) = delete;
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, bool prefetch = true) {
size = file->size;
int fd = fileno(file->fp);
int flags = MAP_SHARED;
#ifdef __linux__
flags |= MAP_POPULATE;
#endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
Die("mmap failed: %s", strerror(errno));
}
if (prefetch) {
// Advise the kernel to preload the mapped memory
if (madvise(addr, file->size, MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno));
}
}
}
~llama_mmap() {
munmap(addr, size);
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file, bool prefetch = true) {
size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError();
if (hMapping == NULL) {
Die("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
}
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
error = GetLastError();
CloseHandle(hMapping);
if (addr == NULL) {
Die("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
}
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
if (prefetch) {
// Advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr;
range.NumberOfBytes = (SIZE_T)size;
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
~llama_mmap() {
if (!UnmapViewOfFile(addr)) {
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
llama_mmap(struct llama_file *) {
Die("mmap not supported");
}
#endif
};
// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
void * addr = NULL;
size_t size = 0;
bool failed_already = false;
llama_mlock() {}
llama_mlock(const llama_mlock &) = delete;
~llama_mlock() {
if (size) {
raw_unlock(addr, size);
}
}
void init(void * addr) {
LLAMA_ASSERT(this->addr == NULL && this->size == 0);
this->addr = addr;
}
void grow_to(size_t target_size) {
LLAMA_ASSERT(addr);
if (failed_already) {
return;
}
size_t granularity = lock_granularity();
target_size = (target_size + granularity - 1) & ~(granularity - 1);
if (target_size > size) {
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
size = target_size;
} else {
failed_already = true;
}
}
}
#ifdef _POSIX_MEMLOCK_RANGE
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
return (size_t) sysconf(_SC_PAGESIZE);
}
#ifdef __APPLE__
#define MLOCK_SUGGESTION \
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
#else
#define MLOCK_SUGGESTION \
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
#endif
bool raw_lock(const void * addr, size_t size) {
if (!mlock(addr, size)) {
return true;
} else {
char* errmsg = std::strerror(errno);
bool suggest = (errno == ENOMEM);
// Check if the resource limit is fine after all
struct rlimit lock_limit;
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
suggest = false;
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
suggest = false;
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false;
}
}
#undef MLOCK_SUGGESTION
void raw_unlock(void * addr, size_t size) {
if (munlock(addr, size)) {
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
}
}
#elif defined(_WIN32)
static constexpr bool SUPPORTED = true;
size_t lock_granularity() {
SYSTEM_INFO si;
GetSystemInfo(&si);
return (size_t) si.dwPageSize;
}
bool raw_lock(void * addr, size_t size) {
for (int tries = 1; ; tries++) {
if (VirtualLock(addr, size)) {
return true;
}
if (tries == 2) {
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
size, this->size, llama_format_win_err(GetLastError()).c_str());
return false;
}
// It failed but this was only the first try; increase the working
// set size and try again.
SIZE_T min_ws_size, max_ws_size;
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
// Per MSDN: "The maximum number of pages that a process can lock
// is equal to the number of pages in its minimum working set minus
// a small overhead."
// Hopefully a megabyte is enough overhead:
size_t increment = size + 1048576;
// The minimum must be <= the maximum, so we need to increase both:
min_ws_size += increment;
max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str());
return false;
}
}
}
void raw_unlock(void * addr, size_t size) {
if (!VirtualUnlock(addr, size)) {
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
llama_format_win_err(GetLastError()).c_str());
}
}
#else
static constexpr bool SUPPORTED = false;
void raw_lock(const void * addr, size_t size) {
fprintf(stderr, "warning: mlock not supported on this system\n");
}
void raw_unlock(const void * addr, size_t size) {}
#endif
};
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
uint8_t * addr = NULL;
size_t size = 0;
void resize(size_t size) {
delete[] addr;
addr = new uint8_t[size];
this->size = size;
}
~llama_buffer() {
delete[] addr;
}
};
#endif

568
third_party/ggml/main.cc vendored Normal file
View file

@ -0,0 +1,568 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi
llama.cpp
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
asm(".ident\t\"\\n\\n\
llama.cpp (MIT License)\\n\
Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off
// Defines sigaction on msys:
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "third_party/ggml/common.h"
#include "libc/nexgen32e/x86feature.h"
#include "third_party/ggml/llama.h"
#include "third_party/libcxx/cassert"
#include "third_party/libcxx/cinttypes"
#include "third_party/libcxx/cmath"
#include "third_party/libcxx/cstdio"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/ctime"
#include "third_party/libcxx/fstream"
#include "third_party/libcxx/iostream"
#include "third_party/libcxx/string"
#include "third_party/libcxx/vector"
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include "libc/calls/calls.h"
#include "libc/calls/sigtimedwait.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/siginfo.h"
#include "libc/sysv/consts/sa.h"
#include "libc/sysv/consts/sicode.h"
#include "libc/sysv/consts/ss.h"
#include "libc/calls/calls.h"
#include "libc/calls/weirdtypes.h"
#include "libc/runtime/pathconf.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/sysconf.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fileno.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/ok.h"
#include "libc/time/time.h"
#include "third_party/getopt/getopt.h"
#include "third_party/musl/crypt.h"
#include "third_party/musl/lockf.h"
#elif defined (_WIN32)
#include "libc/calls/calls.h"
#include "libc/calls/sigtimedwait.h"
#include "libc/calls/struct/sigaction.h"
#include "libc/calls/struct/siginfo.h"
#include "libc/sysv/consts/sa.h"
#include "libc/sysv/consts/sicode.h"
#include "libc/sysv/consts/ss.h"
#endif
static console_state con_st;
static llama_context ** g_ctx;
static bool is_interacting = false;
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
void sigint_handler(int signo) {
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
printf("\n"); // this also force flush stdout.
if (signo == SIGINT) {
if (!is_interacting) {
is_interacting=true;
} else {
llama_print_timings(*g_ctx);
_exit(130);
}
}
}
#endif
static int on_missing_feature(const char *name) {
fprintf(stderr, "error: we require %s support in your microprocessor.\n", name);
return 1;
}
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
if (!X86_HAVE(AVX2)) return on_missing_feature("avx2");
if (!X86_HAVE(AVX)) return on_missing_feature("avx");
if (!X86_HAVE(FMA)) return on_missing_feature("fma");
if (!X86_HAVE(F16C)) return on_missing_feature("f16c");
if (!X86_HAVE(SSE3)) return on_missing_feature("sse3");
if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}
// save choice to use color for later
// (note for later: this is a slightly awkward choice)
con_st.use_color = params.use_color;
#if defined (_WIN32)
win32_console_init(params.use_color);
#endif
if (params.perplexity) {
printf("\n************\n");
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.embedding) {
printf("\n************\n");
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
printf("************\n\n");
return 0;
}
if (params.n_ctx > 2048) {
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
"expect poor results\n", __func__, params.n_ctx);
}
if (params.seed <= 0) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng);
}
// params.prompt = R"(// this function checks if the number n is prime
//bool is_prime(int n) {)";
llama_context * ctx;
g_ctx = &ctx;
// load the model
{
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
ctx = llama_init_from_file(params.model.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}
}
if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(ctx,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return 1;
}
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
// determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
// uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) {
{
const std::vector<llama_token> tmp(params.n_batch, 0);
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
}
{
const std::vector<llama_token> tmp = { 0, };
llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
}
llama_print_timings(ctx);
llama_free(ctx);
return 0;
}
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
// tokenize the prompt
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
const int n_ctx = llama_n_ctx(ctx);
if ((int) embd_inp.size() > n_ctx - 4) {
fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
params.n_keep = (int)embd_inp.size();
}
// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
params.interactive_first = true;
params.antiprompt.push_back("### Instruction:\n\n");
}
// enable interactive mode if reverse prompt or interactive start is specified
if (params.antiprompt.size() != 0 || params.interactive_first) {
params.interactive = true;
}
// determine newline token
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
if (params.verbose_prompt) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
}
if (params.n_keep > 0) {
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
}
fprintf(stderr, "'\n");
}
fprintf(stderr, "\n");
}
if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
sigint_action.sa_handler = sigint_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
#elif defined (_WIN32)
signal(SIGINT, sigint_handler);
#endif
fprintf(stderr, "%s: interactive mode on.\n", __func__);
if (params.antiprompt.size()) {
for (auto antiprompt : params.antiprompt) {
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
}
}
if (!params.input_prefix.empty()) {
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
}
}
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
fprintf(stderr, "\n\n");
// TODO: replace with ring-buffer
std::vector<llama_token> last_n_tokens(n_ctx);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
if (params.interactive) {
fprintf(stderr, "== Running in interactive mode. ==\n"
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
" - Press Ctrl+C to interject at any time.\n"
#endif
" - Press Return to return control to LLaMa.\n"
" - If you want to submit another line, end your input in '\\'.\n\n");
is_interacting = params.interactive_first;
}
bool is_antiprompt = false;
bool input_noecho = false;
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
// the first thing we will do is to output the prompt, so set color accordingly
set_console_color(con_st, CONSOLE_COLOR_PROMPT);
std::vector<llama_token> embd;
while (n_remain != 0 || params.interactive) {
// predict
if (embd.size() > 0) {
// infinite text generation via context swapping
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() > n_ctx) {
const int n_left = n_past - params.n_keep;
n_past = params.n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
n_eval = params.n_batch;
}
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
n_past += n_eval;
}
}
embd.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
// out of user input, sample next token
const int32_t top_k = params.top_k;
const float top_p = params.top_p;
const float temp = params.temp;
const float repeat_penalty = params.repeat_penalty;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx);
if (params.ignore_eos) {
logits[llama_token_eos()] = 0;
}
id = llama_sample_top_p_top_k(ctx,
last_n_tokens.data() + n_ctx - params.repeat_last_n,
params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
}
// replace end of text token with newline token when in interactive mode
if (id == llama_token_eos() && params.interactive && !params.instruct) {
id = llama_token_newline.front();
if (params.antiprompt.size() != 0) {
// tokenize and inject first reverse prompt
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
}
}
// add it to the context
embd.push_back(id);
// echo this to console
input_noecho = false;
// decrement remaining sampling budget
--n_remain;
} else {
// some user input remains from prompt or interaction, forward it to processing
while ((int) embd_inp.size() > n_consumed) {
embd.push_back(embd_inp[n_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[n_consumed]);
++n_consumed;
if ((int) embd.size() >= params.n_batch) {
break;
}
}
}
// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", llama_token_to_str(ctx, id));
}
fflush(stdout);
}
// reset color to default if we there is no pending user input
if (!input_noecho && (int)embd_inp.size() == n_consumed) {
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
}
// in interactive mode, and not currently processing queued inputs;
// check if we should prompt the user for more
if (params.interactive && (int) embd_inp.size() <= n_consumed) {
// check for reverse prompt
if (params.antiprompt.size()) {
std::string last_output;
for (auto id : last_n_tokens) {
last_output += llama_token_to_str(ctx, id);
}
is_antiprompt = false;
// Check if each of the reverse prompts appears at the end of the output.
for (std::string & antiprompt : params.antiprompt) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
is_interacting = true;
is_antiprompt = true;
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
fflush(stdout);
break;
}
}
}
if (n_past > 0 && is_interacting) {
// potentially set color to indicate we are taking user input
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
#if defined (_WIN32)
// Windows: must reactivate sigint handler after each signal
signal(SIGINT, sigint_handler);
#endif
if (params.instruct) {
printf("\n> ");
}
std::string buffer;
if (!params.input_prefix.empty()) {
buffer += params.input_prefix;
printf("%s", buffer.c_str());
}
std::string line;
bool another_line = true;
do {
#if defined(_WIN32)
std::wstring wline;
if (!std::getline(std::wcin, wline)) {
// input stream is bad or EOF received
return 0;
}
win32_utf8_encode(wline, line);
#else
if (!std::getline(std::cin, line)) {
// input stream is bad or EOF received
return 0;
}
#endif
if (line.empty() || line.back() != '\\') {
another_line = false;
} else {
line.pop_back(); // Remove the continue character
}
buffer += line + '\n'; // Append the line to the result
} while (another_line);
// done taking input, reset color
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
// Add tokens to embd only if the input buffer is non-empty
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
// instruct mode: insert instruction prefix
if (params.instruct && !is_antiprompt) {
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
auto line_inp = ::llama_tokenize(ctx, buffer, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
// instruct mode: insert response suffix
if (params.instruct) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
n_remain -= line_inp.size();
}
input_noecho = true; // do not echo this again
}
if (n_past > 0) {
is_interacting = false;
}
}
// end of text token
if (!embd.empty() && embd.back() == llama_token_eos()) {
if (params.instruct) {
is_interacting = true;
} else {
fprintf(stderr, " [end of text]\n");
break;
}
}
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
n_remain = params.n_predict;
is_interacting = true;
}
}
#if defined (_WIN32)
signal(SIGINT, SIG_DFL);
#endif
llama_print_timings(ctx);
llama_free(ctx);
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
return 0;
}

View file

@ -1066,7 +1066,7 @@ public:
#ifndef _LIBCPP_CXX03_LANG
template <class _Key, class ..._Args>
_LIBCPP_INLINE_VISIBILITY
inline _LIBCPP_INLINE_VISIBILITY
pair<iterator, bool> __emplace_unique_key_args(_Key const& __k, _Args&&... __args);
template <class... _Args>
@ -2104,7 +2104,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const&
size_type __bc = bucket_count();
bool __inserted = false;
__next_pointer __nd;
size_t __chash;
size_t __chash = 0;
if (__bc != 0)
{
__chash = __constrain_hash(__hash, __bc);

View file

@ -13,6 +13,7 @@ o/$(MODE)/third_party: \
o/$(MODE)/third_party/finger \
o/$(MODE)/third_party/gdtoa \
o/$(MODE)/third_party/getopt \
o/$(MODE)/third_party/ggml \
o/$(MODE)/third_party/hiredis \
o/$(MODE)/third_party/libcxx \
o/$(MODE)/third_party/linenoise \