mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-07 06:53:33 +00:00
Import llama.cpp
https://github.com/ggerganov/llama.cpp 0b2da20538d01926b77ea237dd1c930c4d20b686 See third_party/ggml/README.cosmo for changes
This commit is contained in:
parent
f42089d5c6
commit
e8b43903b2
14 changed files with 18313 additions and 2 deletions
1
Makefile
1
Makefile
|
@ -144,6 +144,7 @@ include libc/stdio/stdio.mk # │
|
|||
include third_party/libcxx/libcxx.mk # │
|
||||
include net/net.mk # │
|
||||
include third_party/vqsort/vqsort.mk # │
|
||||
include third_party/ggml/ggml.mk # │
|
||||
include libc/log/log.mk # │
|
||||
include third_party/bzip2/bzip2.mk # │
|
||||
include dsp/core/core.mk # │
|
||||
|
|
21
third_party/ggml/LICENSE
vendored
Normal file
21
third_party/ggml/LICENSE
vendored
Normal file
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2023 Georgi Gerganov
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
21
third_party/ggml/README.cosmo
vendored
Normal file
21
third_party/ggml/README.cosmo
vendored
Normal file
|
@ -0,0 +1,21 @@
|
|||
DESCRIPTION
|
||||
|
||||
ggml is a machine learning library useful for LLM inference on CPUs
|
||||
|
||||
LICENSE
|
||||
|
||||
MIT
|
||||
|
||||
ORIGIN
|
||||
|
||||
https://github.com/ggerganov/llama.cpp
|
||||
commit 0b2da20538d01926b77ea237dd1c930c4d20b686
|
||||
Author: Stephan Walter <stephan@walter.name>
|
||||
Date: Wed Apr 26 20:26:42 2023 +0000
|
||||
ggml : slightly faster AVX2 implementation for Q5 (#1197)
|
||||
|
||||
LOCAL CHANGES
|
||||
|
||||
- Refactor headers per cosmo convention
|
||||
- Replace code like 'ggjt' with READ32BE("ggjt")
|
||||
- Remove C++ exceptions; use Die() function instead
|
385
third_party/ggml/common.cc
vendored
Normal file
385
third_party/ggml/common.cc
vendored
Normal file
|
@ -0,0 +1,385 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ llama.cpp │
|
||||
│ Copyright (c) 2023 Georgi Gerganov │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
llama.cpp (MIT License)\\n\
|
||||
Copyright (c) 2023 Georgi Gerganov\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
#include "third_party/ggml/common.h"
|
||||
|
||||
#include "third_party/libcxx/cassert"
|
||||
#include "third_party/libcxx/cstring"
|
||||
#include "third_party/libcxx/fstream"
|
||||
#include "third_party/libcxx/string"
|
||||
#include "third_party/libcxx/iterator"
|
||||
#include "third_party/libcxx/algorithm"
|
||||
|
||||
#if defined (_WIN32)
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/struct/flock.h"
|
||||
#include "libc/calls/weirdtypes.h"
|
||||
#include "libc/sysv/consts/at.h"
|
||||
#include "libc/sysv/consts/f.h"
|
||||
#include "libc/sysv/consts/fd.h"
|
||||
#include "libc/sysv/consts/o.h"
|
||||
#include "libc/sysv/consts/posix.h"
|
||||
#include "libc/sysv/consts/s.h"
|
||||
// MISSING #include <io.h>
|
||||
#pragma comment(lib,"kernel32.lib")
|
||||
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
|
||||
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
|
||||
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
|
||||
extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
|
||||
extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
|
||||
extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
|
||||
const wchar_t * lpWideCharStr, int cchWideChar,
|
||||
char * lpMultiByteStr, int cbMultiByte,
|
||||
const char * lpDefaultChar, bool * lpUsedDefaultChar);
|
||||
#define CP_UTF8 65001
|
||||
#endif
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||
// determine sensible default number of threads.
|
||||
// std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
|
||||
#ifdef __linux__
|
||||
std::ifstream cpuinfo("/proc/cpuinfo");
|
||||
params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
|
||||
std::istream_iterator<std::string>(),
|
||||
std::string("processor"));
|
||||
#endif
|
||||
if (params.n_threads == 0) {
|
||||
params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
|
||||
}
|
||||
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
gpt_params default_params;
|
||||
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-s" || arg == "--seed") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.seed = std::stoi(argv[i]);
|
||||
} else if (arg == "-t" || arg == "--threads") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
} else if (arg == "-p" || arg == "--prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.prompt = argv[i];
|
||||
} else if (arg == "-f" || arg == "--file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::ifstream file(argv[i]);
|
||||
if (!file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||
if (params.prompt.back() == '\n') {
|
||||
params.prompt.pop_back();
|
||||
}
|
||||
} else if (arg == "-n" || arg == "--n_predict") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_predict = std::stoi(argv[i]);
|
||||
} else if (arg == "--top_k") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.top_k = std::stoi(argv[i]);
|
||||
} else if (arg == "-c" || arg == "--ctx_size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_ctx = std::stoi(argv[i]);
|
||||
} else if (arg == "--memory_f32") {
|
||||
params.memory_f16 = false;
|
||||
} else if (arg == "--top_p") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.top_p = std::stof(argv[i]);
|
||||
} else if (arg == "--temp") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.temp = std::stof(argv[i]);
|
||||
} else if (arg == "--repeat_last_n") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.repeat_last_n = std::stoi(argv[i]);
|
||||
} else if (arg == "--repeat_penalty") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.repeat_penalty = std::stof(argv[i]);
|
||||
} else if (arg == "-b" || arg == "--batch_size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_batch = std::stoi(argv[i]);
|
||||
params.n_batch = std::min(512, params.n_batch);
|
||||
} else if (arg == "--keep") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_keep = std::stoi(argv[i]);
|
||||
} else if (arg == "-m" || arg == "--model") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.model = argv[i];
|
||||
} else if (arg == "--lora") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_adapter = argv[i];
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--lora-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.lora_base = argv[i];
|
||||
} else if (arg == "-i" || arg == "--interactive") {
|
||||
params.interactive = true;
|
||||
} else if (arg == "--embedding") {
|
||||
params.embedding = true;
|
||||
} else if (arg == "--interactive-first") {
|
||||
params.interactive_first = true;
|
||||
} else if (arg == "-ins" || arg == "--instruct") {
|
||||
params.instruct = true;
|
||||
} else if (arg == "--color") {
|
||||
params.use_color = true;
|
||||
} else if (arg == "--mlock") {
|
||||
params.use_mlock = true;
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--mtest") {
|
||||
params.mem_test = true;
|
||||
} else if (arg == "--verbose-prompt") {
|
||||
params.verbose_prompt = true;
|
||||
} else if (arg == "-r" || arg == "--reverse-prompt") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.antiprompt.push_back(argv[i]);
|
||||
} else if (arg == "--perplexity") {
|
||||
params.perplexity = true;
|
||||
} else if (arg == "--ignore-eos") {
|
||||
params.ignore_eos = true;
|
||||
} else if (arg == "--n_parts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_parts = std::stoi(argv[i]);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(0);
|
||||
} else if (arg == "--random-prompt") {
|
||||
params.random_prompt = true;
|
||||
} else if (arg == "--in-prefix") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.input_prefix = argv[i];
|
||||
} else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
gpt_print_usage(argc, argv, default_params);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -i, --interactive run in interactive mode\n");
|
||||
fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n");
|
||||
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
|
||||
fprintf(stderr, " run in interactive mode and poll user input upon seeing PROMPT (can be\n");
|
||||
fprintf(stderr, " specified more than once for multiple prompts).\n");
|
||||
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
|
||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
|
||||
fprintf(stderr, " prompt to start generation with (default: empty)\n");
|
||||
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
|
||||
fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n");
|
||||
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
||||
fprintf(stderr, " prompt file to start generation.\n");
|
||||
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
|
||||
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
|
||||
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", (double)params.top_p);
|
||||
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
|
||||
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
|
||||
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
|
||||
fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
|
||||
fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp);
|
||||
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
|
||||
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
|
||||
fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
if (llama_mlock_supported()) {
|
||||
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
}
|
||||
if (llama_mmap_supported()) {
|
||||
fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
|
||||
}
|
||||
fprintf(stderr, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
|
||||
fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
|
||||
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
|
||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||
const int r = rng() % 10;
|
||||
switch (r) {
|
||||
case 0: return "So";
|
||||
case 1: return "Once upon a time";
|
||||
case 2: return "When";
|
||||
case 3: return "The";
|
||||
case 4: return "After";
|
||||
case 5: return "If";
|
||||
case 6: return "import";
|
||||
case 7: return "He";
|
||||
case 8: return "She";
|
||||
case 9: return "They";
|
||||
default: return "To";
|
||||
}
|
||||
|
||||
return "The";
|
||||
}
|
||||
|
||||
// TODO: not great allocating this every time
|
||||
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
|
||||
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
|
||||
std::vector<llama_token> res(text.size() + (int)add_bos);
|
||||
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
|
||||
assert(n >= 0);
|
||||
res.resize(n);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
/* Keep track of current color of output, and emit ANSI code if it changes. */
|
||||
void set_console_color(console_state & con_st, console_color_t color) {
|
||||
if (con_st.use_color && con_st.color != color) {
|
||||
switch(color) {
|
||||
case CONSOLE_COLOR_DEFAULT:
|
||||
printf(ANSI_COLOR_RESET);
|
||||
break;
|
||||
case CONSOLE_COLOR_PROMPT:
|
||||
printf(ANSI_COLOR_YELLOW);
|
||||
break;
|
||||
case CONSOLE_COLOR_USER_INPUT:
|
||||
printf(ANSI_BOLD ANSI_COLOR_GREEN);
|
||||
break;
|
||||
}
|
||||
con_st.color = color;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (_WIN32)
|
||||
void win32_console_init(bool enable_color) {
|
||||
unsigned long dwMode = 0;
|
||||
void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
|
||||
if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
|
||||
hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
|
||||
if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
|
||||
hConOut = 0;
|
||||
}
|
||||
}
|
||||
if (hConOut) {
|
||||
// Enable ANSI colors on Windows 10+
|
||||
if (enable_color && !(dwMode & 0x4)) {
|
||||
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
|
||||
}
|
||||
// Set console output codepage to UTF8
|
||||
SetConsoleOutputCP(CP_UTF8);
|
||||
}
|
||||
void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
|
||||
if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
|
||||
// Set console input codepage to UTF16
|
||||
_setmode(_fileno(stdin), _O_WTEXT);
|
||||
}
|
||||
}
|
||||
|
||||
// Convert a wide Unicode string to an UTF8 string
|
||||
void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
|
||||
int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
|
||||
std::string strTo(size_needed, 0);
|
||||
WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
|
||||
str = strTo;
|
||||
}
|
||||
#endif
|
103
third_party/ggml/common.h
vendored
Normal file
103
third_party/ggml/common.h
vendored
Normal file
|
@ -0,0 +1,103 @@
|
|||
// -*- c++ -*-
|
||||
// clang-format off
|
||||
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
|
||||
#include "third_party/ggml/llama.h"
|
||||
#include "third_party/libcxx/string"
|
||||
#include "third_party/libcxx/vector"
|
||||
#include "third_party/libcxx/random"
|
||||
#include "third_party/libcxx/thread"
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
// clang-format off
|
||||
// Various helper functions and utilities
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
|
||||
struct gpt_params {
|
||||
int32_t seed = -1; // RNG seed
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t n_predict = 128; // new tokens to predict
|
||||
int32_t repeat_last_n = 64; // last n tokens to penalize
|
||||
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
|
||||
// sampling parameters
|
||||
int32_t top_k = 40;
|
||||
float top_p = 0.95f;
|
||||
float temp = 0.80f;
|
||||
float repeat_penalty = 1.10f;
|
||||
|
||||
std::string model = "models/lamma-7B/ggml-model.bin"; // model path
|
||||
std::string prompt = "";
|
||||
std::string input_prefix = ""; // string to prefix user inputs with
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
|
||||
std::string lora_adapter = ""; // lora adapter path
|
||||
std::string lora_base = ""; // base model path for the lora adapter
|
||||
|
||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
bool interactive = false; // interactive mode
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool ignore_eos = false; // do not stop generating after eos
|
||||
bool perplexity = false; // compute perplexity over the prompt
|
||||
bool use_mmap = true; // use mmap for faster loads
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
bool mem_test = false; // compute maximum memory usage
|
||||
bool verbose_prompt = false; // print prompt tokens before generation
|
||||
};
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
||||
|
||||
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||
|
||||
std::string gpt_random_prompt(std::mt19937 & rng);
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
|
||||
|
||||
//
|
||||
// Console utils
|
||||
//
|
||||
|
||||
#define ANSI_COLOR_RED "\x1b[31m"
|
||||
#define ANSI_COLOR_GREEN "\x1b[32m"
|
||||
#define ANSI_COLOR_YELLOW "\x1b[33m"
|
||||
#define ANSI_COLOR_BLUE "\x1b[34m"
|
||||
#define ANSI_COLOR_MAGENTA "\x1b[35m"
|
||||
#define ANSI_COLOR_CYAN "\x1b[36m"
|
||||
#define ANSI_COLOR_RESET "\x1b[0m"
|
||||
#define ANSI_BOLD "\x1b[1m"
|
||||
|
||||
enum console_color_t {
|
||||
CONSOLE_COLOR_DEFAULT=0,
|
||||
CONSOLE_COLOR_PROMPT,
|
||||
CONSOLE_COLOR_USER_INPUT
|
||||
};
|
||||
|
||||
struct console_state {
|
||||
bool use_color = false;
|
||||
console_color_t color = CONSOLE_COLOR_DEFAULT;
|
||||
};
|
||||
|
||||
void set_console_color(console_state & con_st, console_color_t color);
|
||||
|
||||
#if defined (_WIN32)
|
||||
void win32_console_init(bool enable_color);
|
||||
void win32_utf8_encode(const std::wstring & wstr, std::string & str);
|
||||
#endif
|
||||
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_ */
|
13137
third_party/ggml/ggml.c
vendored
Normal file
13137
third_party/ggml/ggml.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
889
third_party/ggml/ggml.h
vendored
Normal file
889
third_party/ggml/ggml.h
vendored
Normal file
|
@ -0,0 +1,889 @@
|
|||
// clang-format off
|
||||
#ifndef COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
|
||||
#define COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_
|
||||
#if !(__ASSEMBLER__ + __LINKER__ + 0)
|
||||
COSMOPOLITAN_C_START_
|
||||
// clang-format off
|
||||
|
||||
//
|
||||
// GGML Tensor Library
|
||||
//
|
||||
// This documentation is still a work in progress.
|
||||
// If you wish some specific topics to be covered, feel free to drop a comment:
|
||||
//
|
||||
// https://github.com/ggerganov/whisper.cpp/issues/40
|
||||
//
|
||||
// ## Overview
|
||||
//
|
||||
// This library implements:
|
||||
//
|
||||
// - a set of tensor operations
|
||||
// - automatic differentiation
|
||||
// - basic optimization algorithms
|
||||
//
|
||||
// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
|
||||
// but is not limited to, the following:
|
||||
//
|
||||
// - linear regression
|
||||
// - support vector machines
|
||||
// - neural networks
|
||||
//
|
||||
// The library allows the user to define a certain function using the available tensor operations. This function
|
||||
// definition is represented internally via a computation graph. Each tensor operation in the function definition
|
||||
// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
|
||||
// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
|
||||
// using one of the available optimization algorithms.
|
||||
//
|
||||
// For example, here we define the function: f(x) = a*x^2 + b
|
||||
//
|
||||
// {
|
||||
// struct ggml_init_params params = {
|
||||
// .mem_size = 16*1024*1024,
|
||||
// .mem_buffer = NULL,
|
||||
// };
|
||||
//
|
||||
// // memory allocation happens here
|
||||
// struct ggml_context * ctx = ggml_init(params);
|
||||
//
|
||||
// struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
||||
//
|
||||
// ggml_set_param(ctx, x); // x is an input variable
|
||||
//
|
||||
// struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
||||
// struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
|
||||
// struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
|
||||
// struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
|
||||
//
|
||||
// ...
|
||||
// }
|
||||
//
|
||||
// Notice that the function definition above does not involve any actual computation. The computation is performed only
|
||||
// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
|
||||
//
|
||||
// {
|
||||
// ...
|
||||
//
|
||||
// struct ggml_cgraph gf = ggml_build_forward(f);
|
||||
//
|
||||
// // set the input variable and parameter values
|
||||
// ggml_set_f32(x, 2.0f);
|
||||
// ggml_set_f32(a, 3.0f);
|
||||
// ggml_set_f32(b, 4.0f);
|
||||
//
|
||||
// ggml_graph_compute(ctx0, &gf);
|
||||
//
|
||||
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
|
||||
//
|
||||
// ...
|
||||
// }
|
||||
//
|
||||
// The actual computation is performed in the ggml_graph_compute() function.
|
||||
//
|
||||
// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
|
||||
// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
|
||||
// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
|
||||
// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
|
||||
// actually needed.
|
||||
//
|
||||
// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
|
||||
// differentiation and optimization algorithms.
|
||||
//
|
||||
// The described approach allows to define the function graph once and then compute its forward or backward graphs
|
||||
// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
|
||||
// the user can avoid the memory allocation overhead at runtime.
|
||||
//
|
||||
// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
|
||||
// citizens, but in theory the library can be extended to support FP8 and integer data types.
|
||||
//
|
||||
// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
|
||||
// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
|
||||
// clear that the library needs to support more complex operations. The way to support these operations is not clear
|
||||
// yet, but a few examples are demonstrated in the following operations:
|
||||
//
|
||||
// - ggml_permute()
|
||||
// - ggml_conv_1d_1s()
|
||||
// - ggml_conv_1d_2s()
|
||||
//
|
||||
// For each tensor operator, the library implements a forward and backward computation function. The forward function
|
||||
// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
|
||||
// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
|
||||
// calculus class, or watch the following video:
|
||||
//
|
||||
// What is Automatic Differentiation?
|
||||
// https://www.youtube.com/watch?v=wG_nF1awSSY
|
||||
//
|
||||
//
|
||||
// ## Tensor data (struct ggml_tensor)
|
||||
//
|
||||
// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
|
||||
// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
|
||||
// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
|
||||
//
|
||||
// {
|
||||
// struct ggml_tensor * c = ggml_add(ctx, a, b);
|
||||
//
|
||||
// assert(c->src[0] == a);
|
||||
// assert(c->src[1] == b);
|
||||
// }
|
||||
//
|
||||
// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
|
||||
// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
|
||||
// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
|
||||
// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
|
||||
// contiguous in memory.
|
||||
//
|
||||
// The data of the tensor is accessed via the "data" pointer. For example:
|
||||
//
|
||||
// {
|
||||
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
|
||||
//
|
||||
// // a[1, 2] = 1.0f;
|
||||
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
|
||||
//
|
||||
// // a[2, 0] = 2.0f;
|
||||
// *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
|
||||
//
|
||||
// ...
|
||||
// }
|
||||
//
|
||||
// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
|
||||
//
|
||||
// ## The matrix multiplication operator (ggml_mul_mat)
|
||||
//
|
||||
// TODO
|
||||
//
|
||||
//
|
||||
// ## Multi-threading
|
||||
//
|
||||
// TODO
|
||||
//
|
||||
//
|
||||
// ## Overview of ggml.c
|
||||
//
|
||||
// TODO
|
||||
//
|
||||
//
|
||||
// ## SIMD optimizations
|
||||
//
|
||||
// TODO
|
||||
//
|
||||
//
|
||||
// ## Debugging ggml
|
||||
//
|
||||
// TODO
|
||||
//
|
||||
//
|
||||
|
||||
#ifdef GGML_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef GGML_BUILD
|
||||
# define GGML_API __declspec(dllexport)
|
||||
# else
|
||||
# define GGML_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define GGML_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define GGML_API
|
||||
#endif
|
||||
|
||||
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
||||
#define GGML_FILE_VERSION 1
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_NODES 4096
|
||||
#define GGML_MAX_PARAMS 16
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_OPT 4
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
// we use the built-in 16-bit float type
|
||||
typedef __fp16 ggml_fp16_t;
|
||||
#else
|
||||
typedef uint16_t ggml_fp16_t;
|
||||
#endif
|
||||
|
||||
// convert FP16 <-> FP32
|
||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
|
||||
enum ggml_type {
|
||||
GGML_TYPE_F32 = 0,
|
||||
GGML_TYPE_F16 = 1,
|
||||
GGML_TYPE_Q4_0 = 2,
|
||||
GGML_TYPE_Q4_1 = 3,
|
||||
GGML_TYPE_Q4_2 = 4,
|
||||
GGML_TYPE_Q4_3 = 5,
|
||||
GGML_TYPE_Q5_0 = 6,
|
||||
GGML_TYPE_Q5_1 = 7,
|
||||
GGML_TYPE_Q8_0 = 8,
|
||||
GGML_TYPE_Q8_1 = 9,
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
GGML_TYPE_COUNT,
|
||||
};
|
||||
|
||||
// available tensor operations:
|
||||
enum ggml_op {
|
||||
GGML_OP_NONE = 0,
|
||||
|
||||
GGML_OP_DUP,
|
||||
GGML_OP_ADD,
|
||||
GGML_OP_SUB,
|
||||
GGML_OP_MUL,
|
||||
GGML_OP_DIV,
|
||||
GGML_OP_SQR,
|
||||
GGML_OP_SQRT,
|
||||
GGML_OP_SUM,
|
||||
GGML_OP_MEAN,
|
||||
GGML_OP_REPEAT,
|
||||
GGML_OP_ABS,
|
||||
GGML_OP_SGN,
|
||||
GGML_OP_NEG,
|
||||
GGML_OP_STEP,
|
||||
GGML_OP_RELU,
|
||||
GGML_OP_GELU,
|
||||
GGML_OP_SILU,
|
||||
GGML_OP_NORM, // normalize
|
||||
GGML_OP_RMS_NORM,
|
||||
|
||||
GGML_OP_MUL_MAT,
|
||||
|
||||
GGML_OP_SCALE,
|
||||
GGML_OP_CPY,
|
||||
GGML_OP_CONT,
|
||||
GGML_OP_RESHAPE,
|
||||
GGML_OP_VIEW,
|
||||
GGML_OP_PERMUTE,
|
||||
GGML_OP_TRANSPOSE,
|
||||
GGML_OP_GET_ROWS,
|
||||
GGML_OP_DIAG_MASK_INF,
|
||||
GGML_OP_SOFT_MAX,
|
||||
GGML_OP_ROPE,
|
||||
GGML_OP_CONV_1D_1S,
|
||||
GGML_OP_CONV_1D_2S,
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
GGML_OP_FLASH_FF,
|
||||
|
||||
GGML_OP_MAP_UNARY,
|
||||
GGML_OP_MAP_BINARY,
|
||||
|
||||
GGML_OP_COUNT,
|
||||
};
|
||||
|
||||
|
||||
// ggml object
|
||||
struct ggml_object {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
|
||||
struct ggml_object * next;
|
||||
|
||||
char padding[8];
|
||||
};
|
||||
|
||||
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
|
||||
|
||||
// n-dimensional tensor
|
||||
struct ggml_tensor {
|
||||
enum ggml_type type;
|
||||
|
||||
int n_dims;
|
||||
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||
// nb[0] = sizeof(type)
|
||||
// nb[1] = nb[0] * ne[0] + padding
|
||||
// nb[i] = nb[i-1] * ne[i-1]
|
||||
|
||||
// compute data
|
||||
enum ggml_op op;
|
||||
|
||||
bool is_param;
|
||||
|
||||
struct ggml_tensor * grad;
|
||||
struct ggml_tensor * src0;
|
||||
struct ggml_tensor * src1;
|
||||
struct ggml_tensor * opt[GGML_MAX_OPT];
|
||||
|
||||
// thread scheduling
|
||||
int n_tasks;
|
||||
|
||||
// performance
|
||||
int perf_runs;
|
||||
int64_t perf_cycles;
|
||||
int64_t perf_time_us;
|
||||
|
||||
void * data;
|
||||
char padding[8];
|
||||
};
|
||||
|
||||
// computation graph
|
||||
struct ggml_cgraph {
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
int n_threads;
|
||||
|
||||
size_t work_size;
|
||||
struct ggml_tensor * work;
|
||||
|
||||
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
||||
struct ggml_tensor * grads[GGML_MAX_NODES];
|
||||
struct ggml_tensor * leafs[GGML_MAX_NODES];
|
||||
|
||||
// performance
|
||||
int perf_runs;
|
||||
int64_t perf_cycles;
|
||||
int64_t perf_time_us;
|
||||
};
|
||||
|
||||
// scratch buffer
|
||||
struct ggml_scratch {
|
||||
size_t offs;
|
||||
size_t size;
|
||||
void * data;
|
||||
};
|
||||
|
||||
struct ggml_init_params {
|
||||
// memory pool
|
||||
size_t mem_size; // bytes
|
||||
void * mem_buffer; // if NULL, memory will be allocated internally
|
||||
bool no_alloc; // don't allocate memory for the tensor data
|
||||
};
|
||||
|
||||
// misc
|
||||
|
||||
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
||||
GGML_API int64_t ggml_time_ms(void);
|
||||
GGML_API int64_t ggml_time_us(void);
|
||||
GGML_API int64_t ggml_cycles(void);
|
||||
GGML_API int64_t ggml_cycles_per_ms(void);
|
||||
|
||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API int ggml_blck_size (enum ggml_type type);
|
||||
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
||||
|
||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||
|
||||
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
||||
|
||||
// main
|
||||
|
||||
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
GGML_API void ggml_free(struct ggml_context * ctx);
|
||||
|
||||
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||
|
||||
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int n_dims,
|
||||
const int64_t *ne);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor_1d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int64_t ne0);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor_2d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor_3d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor_4d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// operations on tensors with backpropagation
|
||||
//
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_dup(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_add_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_sub(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_mul(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_div(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_sqr(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_sqrt(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// return scalar
|
||||
// TODO: compute sum along rows
|
||||
GGML_API struct ggml_tensor * ggml_sum(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// mean along rows
|
||||
GGML_API struct ggml_tensor * ggml_mean(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// if a is the same shape as b, and a is not parameter, return a
|
||||
// otherwise, return a new tensor: repeat(a) to fit in b
|
||||
GGML_API struct ggml_tensor * ggml_repeat(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_abs(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_sgn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_neg(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_step(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_relu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// TODO: double-check this computation is correct
|
||||
GGML_API struct ggml_tensor * ggml_gelu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_silu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// normalize along rows
|
||||
// TODO: eps is hardcoded to 1e-5 for now
|
||||
GGML_API struct ggml_tensor * ggml_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_rms_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// A: m rows, n columns
|
||||
// B: p rows, n columns (i.e. we transpose it internally)
|
||||
// result is m columns, p rows
|
||||
GGML_API struct ggml_tensor * ggml_mul_mat(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
//
|
||||
// operations on tensors without backpropagation
|
||||
//
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_scale(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// a -> b, return view(b)
|
||||
GGML_API struct ggml_tensor * ggml_cpy(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// make contiguous
|
||||
GGML_API struct ggml_tensor * ggml_cont(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// return view(a), b specifies the new shape
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
GGML_API struct ggml_tensor * ggml_reshape(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// return view(a)
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
GGML_API struct ggml_tensor * ggml_reshape_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
// return view(a)
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
GGML_API struct ggml_tensor * ggml_reshape_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
// offset in bytes
|
||||
GGML_API struct ggml_tensor * ggml_view_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
size_t offset);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_view_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
size_t nb1, // row stride in bytes
|
||||
size_t offset);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_view_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
size_t nb1, // row stride in bytes
|
||||
size_t nb2, // slice stride in bytes
|
||||
size_t offset);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_permute(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int axis0,
|
||||
int axis1,
|
||||
int axis2,
|
||||
int axis3);
|
||||
|
||||
// alias for ggml_permute(ctx, a, 1, 0, 2, 3)
|
||||
GGML_API struct ggml_tensor * ggml_transpose(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_get_rows(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// set elements above the diagonal to -INF
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past);
|
||||
|
||||
// in-place, returns view(a)
|
||||
GGML_API struct ggml_tensor * ggml_soft_max(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// rotary position embedding
|
||||
// in-place, returns view(a)
|
||||
// if mode & 1 == 1, skip n_past elements
|
||||
// if mode & 2 == 1, GPT-NeoX style
|
||||
// TODO: avoid creating a new tensor every time
|
||||
GGML_API struct ggml_tensor * ggml_rope(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
int n_dims,
|
||||
int mode);
|
||||
|
||||
// padding = 1
|
||||
// TODO: we don't support extra parameters for now
|
||||
// that's why we are hard-coding the stride, padding, and dilation
|
||||
// not great ..
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_1s(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_2s(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
bool masked);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_flash_ff(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b0,
|
||||
struct ggml_tensor * b1,
|
||||
struct ggml_tensor * c0,
|
||||
struct ggml_tensor * c1);
|
||||
|
||||
// Mapping operations
|
||||
GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
||||
GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
const ggml_unary_op_f32_t fun);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
const ggml_binary_op_f32_t fun);
|
||||
|
||||
//
|
||||
// automatic differentiation
|
||||
//
|
||||
|
||||
GGML_API void ggml_set_param(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||
|
||||
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
|
||||
// print info and performance information for the graph
|
||||
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
||||
|
||||
// dump the graph into a file using the dot format
|
||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
||||
|
||||
//
|
||||
// optimization
|
||||
//
|
||||
|
||||
// optimization methods
|
||||
enum ggml_opt_type {
|
||||
GGML_OPT_ADAM,
|
||||
GGML_OPT_LBFGS,
|
||||
};
|
||||
|
||||
// linesearch methods
|
||||
enum ggml_linesearch {
|
||||
GGML_LINESEARCH_DEFAULT = 1,
|
||||
|
||||
GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
|
||||
GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
|
||||
GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
|
||||
};
|
||||
|
||||
// optimization return values
|
||||
enum ggml_opt_result {
|
||||
GGML_OPT_OK = 0,
|
||||
GGML_OPT_DID_NOT_CONVERGE,
|
||||
GGML_OPT_NO_CONTEXT,
|
||||
GGML_OPT_INVALID_WOLFE,
|
||||
GGML_OPT_FAIL,
|
||||
|
||||
GGML_LINESEARCH_FAIL = -128,
|
||||
GGML_LINESEARCH_MINIMUM_STEP,
|
||||
GGML_LINESEARCH_MAXIMUM_STEP,
|
||||
GGML_LINESEARCH_MAXIMUM_ITERATIONS,
|
||||
GGML_LINESEARCH_INVALID_PARAMETERS,
|
||||
};
|
||||
|
||||
// optimization parameters
|
||||
//
|
||||
// see ggml.c (ggml_opt_default_params) for default values
|
||||
//
|
||||
struct ggml_opt_params {
|
||||
enum ggml_opt_type type;
|
||||
|
||||
int n_threads;
|
||||
|
||||
// delta-based convergence test
|
||||
//
|
||||
// if past == 0 - disabled
|
||||
// if past > 0:
|
||||
// stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
|
||||
//
|
||||
int past;
|
||||
float delta;
|
||||
|
||||
// maximum number of iterations without improvement
|
||||
//
|
||||
// if 0 - disabled
|
||||
// if > 0:
|
||||
// assume convergence if no cost improvement in this number of iterations
|
||||
//
|
||||
int max_no_improvement;
|
||||
|
||||
bool print_forward_graph;
|
||||
bool print_backward_graph;
|
||||
|
||||
// ADAM parameters
|
||||
struct {
|
||||
int n_iter;
|
||||
|
||||
float alpha; // learning rate
|
||||
float beta1;
|
||||
float beta2;
|
||||
float eps; // epsilon for numerical stability
|
||||
float eps_f; // epsilon for convergence test
|
||||
float eps_g; // epsilon for convergence test
|
||||
} adam;
|
||||
|
||||
// LBFGS parameters
|
||||
struct {
|
||||
int m; // number of corrections to approximate the inv. Hessian
|
||||
int n_iter;
|
||||
int max_linesearch;
|
||||
|
||||
float eps; // convergence tolerance
|
||||
float ftol; // line search tolerance
|
||||
float wolfe;
|
||||
float min_step;
|
||||
float max_step;
|
||||
|
||||
enum ggml_linesearch linesearch;
|
||||
} lbfgs;
|
||||
};
|
||||
|
||||
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
||||
|
||||
// optimize the function defined by the tensor f
|
||||
GGML_API enum ggml_opt_result ggml_opt(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_opt_params params,
|
||||
struct ggml_tensor * f);
|
||||
|
||||
//
|
||||
// quantization
|
||||
//
|
||||
|
||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
||||
|
||||
//
|
||||
// system info
|
||||
//
|
||||
|
||||
GGML_API int ggml_cpu_has_avx (void);
|
||||
GGML_API int ggml_cpu_has_avx2 (void);
|
||||
GGML_API int ggml_cpu_has_avx512 (void);
|
||||
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
||||
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
GGML_API int ggml_cpu_has_fp16_va (void);
|
||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
||||
GGML_API int ggml_cpu_has_blas (void);
|
||||
GGML_API int ggml_cpu_has_cublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
||||
|
||||
//
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
#define GGML_RESTRICT
|
||||
#else
|
||||
#define GGML_RESTRICT restrict
|
||||
#endif
|
||||
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||
typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||
typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
||||
|
||||
typedef struct {
|
||||
dequantize_row_q_t dequantize_row_q;
|
||||
quantize_row_q_t quantize_row_q;
|
||||
quantize_row_q_t quantize_row_q_reference;
|
||||
quantize_row_q_t quantize_row_q_dot;
|
||||
vec_dot_q_t vec_dot_q;
|
||||
enum ggml_type vec_dot_type;
|
||||
} quantize_fns_t;
|
||||
|
||||
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
|
||||
|
||||
COSMOPOLITAN_C_END_
|
||||
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
|
||||
#endif /* COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_ */
|
114
third_party/ggml/ggml.mk
vendored
Normal file
114
third_party/ggml/ggml.mk
vendored
Normal file
|
@ -0,0 +1,114 @@
|
|||
#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
|
||||
#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
|
||||
|
||||
PKGS += THIRD_PARTY_GGML
|
||||
|
||||
################################################################################
|
||||
# single file machine learning framework written in c
|
||||
# make -j8 o//third_party/ggml/ggml.a
|
||||
|
||||
THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_A
|
||||
THIRD_PARTY_GGML = $(THIRD_PARTY_GGML_A_DEPS) $(THIRD_PARTY_GGML_A)
|
||||
THIRD_PARTY_GGML_A = o/$(MODE)/third_party/ggml/ggml.a
|
||||
THIRD_PARTY_GGML_A_HDRS = third_party/ggml/ggml.h
|
||||
THIRD_PARTY_GGML_A_SRCS = third_party/ggml/ggml.c
|
||||
THIRD_PARTY_GGML_A_OBJS = $(THIRD_PARTY_GGML_A_SRCS:%.c=o/$(MODE)/%.o)
|
||||
THIRD_PARTY_GGML_A_FILES = $(THIRD_PARTY_GGML_A_SRCS) $(THIRD_PARTY_GGML_A_HDRS)
|
||||
THIRD_PARTY_GGML_A_CHECKS = $(THIRD_PARTY_GGML_A).pkg $(THIRD_PARTY_GGML_A_HDRS:%=o/$(MODE)/%.ok)
|
||||
|
||||
THIRD_PARTY_GGML_A_DIRECTDEPS = \
|
||||
LIBC_CALLS \
|
||||
LIBC_INTRIN \
|
||||
LIBC_MEM \
|
||||
LIBC_NEXGEN32E \
|
||||
LIBC_RUNTIME \
|
||||
LIBC_STDIO \
|
||||
LIBC_THREAD \
|
||||
LIBC_STR \
|
||||
LIBC_STUBS \
|
||||
LIBC_SYSV \
|
||||
LIBC_TINYMATH
|
||||
|
||||
THIRD_PARTY_GGML_A_DEPS := \
|
||||
$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x))))
|
||||
|
||||
$(THIRD_PARTY_GGML_A): \
|
||||
third_party/ggml/ \
|
||||
$(THIRD_PARTY_GGML_A).pkg \
|
||||
$(THIRD_PARTY_GGML_A_OBJS)
|
||||
|
||||
$(THIRD_PARTY_GGML_A).pkg: \
|
||||
$(THIRD_PARTY_GGML_A_OBJS) \
|
||||
$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x)_A).pkg)
|
||||
|
||||
$(THIRD_PARTY_GGML_A_OBJS): private \
|
||||
OVERRIDE_CFLAGS += \
|
||||
-O3 \
|
||||
-ffunction-sections \
|
||||
-fdata-sections \
|
||||
-msse3 \
|
||||
-mavx \
|
||||
-mavx2 \
|
||||
-mf16c \
|
||||
-mfma
|
||||
|
||||
################################################################################
|
||||
# command for running inference on large language models
|
||||
# make -j8 o//third_party/ggml/llama.com
|
||||
|
||||
THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_LLAMA
|
||||
THIRD_PARTY_GGML_LLAMA = o/$(MODE)/third_party/ggml/llama.com
|
||||
THIRD_PARTY_GGML_LLAMA_HDRS = third_party/ggml/llama.h third_party/ggml/llama_util.h third_party/ggml/common.h
|
||||
THIRD_PARTY_GGML_LLAMA_SRCS = third_party/ggml/main.cc third_party/ggml/llama.cc third_party/ggml/common.cc
|
||||
THIRD_PARTY_GGML_LLAMA_OBJS = $(THIRD_PARTY_GGML_LLAMA_SRCS:%.cc=o/$(MODE)/%.o)
|
||||
THIRD_PARTY_GGML_LLAMA_FILES := $(THIRD_PARTY_GGML_LLAMA_SRCS) $(THIRD_PARTY_GGML_LLAMA_HDRS)
|
||||
THIRD_PARTY_GGML_LLAMA_CHECKS = $(THIRD_PARTY_GGML_LLAMA).pkg $(THIRD_PARTY_GGML_LLAMA_HDRS:%=o/$(MODE)/%.okk)
|
||||
|
||||
THIRD_PARTY_GGML_LLAMA_DIRECTDEPS = \
|
||||
LIBC_CALLS \
|
||||
LIBC_FMT \
|
||||
LIBC_INTRIN \
|
||||
LIBC_MEM \
|
||||
LIBC_NEXGEN32E \
|
||||
LIBC_RUNTIME \
|
||||
LIBC_STDIO \
|
||||
LIBC_STR \
|
||||
LIBC_STUBS \
|
||||
LIBC_SYSV \
|
||||
LIBC_THREAD \
|
||||
LIBC_TINYMATH \
|
||||
THIRD_PARTY_GGML \
|
||||
THIRD_PARTY_LIBCXX
|
||||
|
||||
THIRD_PARTY_GGML_LLAMA_DEPS := \
|
||||
$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x))))
|
||||
|
||||
$(THIRD_PARTY_GGML_LLAMA).dbg: \
|
||||
$(THIRD_PARTY_GGML_LLAMA).pkg \
|
||||
$(THIRD_PARTY_GGML_LLAMA_DEPS) \
|
||||
o/$(MODE)/third_party/ggml/common.o \
|
||||
o/$(MODE)/third_party/ggml/llama.o \
|
||||
o/$(MODE)/third_party/ggml/main.o \
|
||||
$(CRT) \
|
||||
$(APE_NO_MODIFY_SELF)
|
||||
@$(APELINK)
|
||||
|
||||
$(THIRD_PARTY_GGML_LLAMA).pkg: \
|
||||
$(THIRD_PARTY_GGML_LLAMA_OBJS) \
|
||||
$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x)_A).pkg)
|
||||
|
||||
################################################################################
|
||||
|
||||
THIRD_PARTY_GGML_COMS = $(THIRD_PARTY_GGML_LLAMA)
|
||||
THIRD_PARTY_GGML_BINS = $(THIRD_PARTY_GGML_COMS) $(THIRD_PARTY_GGML_COMS:%=%.dbg)
|
||||
THIRD_PARTY_GGML_LIBS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)))
|
||||
THIRD_PARTY_GGML_SRCS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_SRCS))
|
||||
THIRD_PARTY_GGML_HDRS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_HDRS))
|
||||
THIRD_PARTY_GGML_OBJS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_OBJS))
|
||||
THIRD_PARTY_GGML_CHECKS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_CHECKS))
|
||||
$(THIRD_PARTY_GGML_OBJS): third_party/ggml/ggml.mk
|
||||
|
||||
.PHONY: o/$(MODE)/third_party/ggml
|
||||
o/$(MODE)/third_party/ggml: \
|
||||
$(THIRD_PARTY_GGML_BINS) \
|
||||
$(THIRD_PARTY_GGML_CHECKS)
|
2472
third_party/ggml/llama.cc
vendored
Normal file
2472
third_party/ggml/llama.cc
vendored
Normal file
File diff suppressed because it is too large
Load diff
211
third_party/ggml/llama.h
vendored
Normal file
211
third_party/ggml/llama.h
vendored
Normal file
|
@ -0,0 +1,211 @@
|
|||
// -*- c++ -*-
|
||||
// clang-format off
|
||||
#ifndef LLAMA_H
|
||||
#define LLAMA_H
|
||||
|
||||
#ifdef LLAMA_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef LLAMA_BUILD
|
||||
# define LLAMA_API __declspec(dllexport)
|
||||
# else
|
||||
# define LLAMA_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define LLAMA_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define LLAMA_API
|
||||
#endif
|
||||
|
||||
#define LLAMA_FILE_VERSION 1
|
||||
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
|
||||
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// C interface
|
||||
//
|
||||
// TODO: show sample usage
|
||||
//
|
||||
|
||||
struct llama_context;
|
||||
|
||||
typedef int llama_token;
|
||||
|
||||
typedef struct llama_token_data {
|
||||
llama_token id; // token id
|
||||
|
||||
float p; // probability of the token
|
||||
float plog; // log probability of the token
|
||||
|
||||
} llama_token_data;
|
||||
|
||||
typedef void (*llama_progress_callback)(float progress, void *ctx);
|
||||
|
||||
struct llama_context_params {
|
||||
int n_ctx; // text context
|
||||
int n_parts; // -1 for default
|
||||
int seed; // RNG seed, 0 for random
|
||||
|
||||
bool f16_kv; // use fp16 for KV cache
|
||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||
bool vocab_only; // only load the vocabulary, no weights
|
||||
bool use_mmap; // use mmap if possible
|
||||
bool use_mlock; // force system to keep model in RAM
|
||||
bool embedding; // embedding mode only
|
||||
|
||||
// called with a progress value between 0 and 1, pass NULL to disable
|
||||
llama_progress_callback progress_callback;
|
||||
// context pointer passed to the progress callback
|
||||
void * progress_callback_user_data;
|
||||
};
|
||||
|
||||
// model file types
|
||||
enum llama_ftype {
|
||||
LLAMA_FTYPE_ALL_F32 = 0,
|
||||
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
|
||||
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
|
||||
};
|
||||
|
||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||
|
||||
LLAMA_API bool llama_mmap_supported();
|
||||
LLAMA_API bool llama_mlock_supported();
|
||||
|
||||
// Various functions for loading a ggml llama model.
|
||||
// Allocate (almost) all memory needed for the model.
|
||||
// Return NULL on failure
|
||||
LLAMA_API struct llama_context * llama_init_from_file(
|
||||
const char * path_model,
|
||||
struct llama_context_params params);
|
||||
|
||||
// Frees all allocated memory
|
||||
LLAMA_API void llama_free(struct llama_context * ctx);
|
||||
|
||||
// TODO: not great API - very likely to change
|
||||
// Returns 0 on success
|
||||
// nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
|
||||
LLAMA_API int llama_model_quantize(
|
||||
const char * fname_inp,
|
||||
const char * fname_out,
|
||||
enum llama_ftype ftype,
|
||||
int nthread);
|
||||
|
||||
// Apply a LoRA adapter to a loaded model
|
||||
// path_base_model is the path to a higher quality model to use as a base for
|
||||
// the layers modified by the adapter. Can be NULL to use the current loaded model.
|
||||
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
|
||||
// will be applied on top of the previous one
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_apply_lora_from_file(
|
||||
struct llama_context * ctx,
|
||||
const char * path_lora,
|
||||
const char * path_base_model,
|
||||
int n_threads);
|
||||
|
||||
// Returns the number of tokens in the KV cache
|
||||
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
|
||||
|
||||
// Sets the current rng seed.
|
||||
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
|
||||
|
||||
// Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
|
||||
LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
|
||||
|
||||
// Copies the state to the specified destination address.
|
||||
// Destination needs to have allocated enough memory.
|
||||
// Returns the number of bytes copied
|
||||
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
|
||||
|
||||
// Set the state reading from the specified address
|
||||
// Returns the number of bytes read
|
||||
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
|
||||
|
||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||
// tokens + n_tokens is the provided batch of new tokens to process
|
||||
// n_past is the number of tokens to use from previous eval calls
|
||||
// Returns 0 on success
|
||||
LLAMA_API int llama_eval(
|
||||
struct llama_context * ctx,
|
||||
const llama_token * tokens,
|
||||
int n_tokens,
|
||||
int n_past,
|
||||
int n_threads);
|
||||
|
||||
// Convert the provided text into tokens.
|
||||
// The tokens pointer must be large enough to hold the resulting tokens.
|
||||
// Returns the number of tokens on success, no more than n_max_tokens
|
||||
// Returns a negative number on failure - the number of tokens that would have been returned
|
||||
// TODO: not sure if correct
|
||||
LLAMA_API int llama_tokenize(
|
||||
struct llama_context * ctx,
|
||||
const char * text,
|
||||
llama_token * tokens,
|
||||
int n_max_tokens,
|
||||
bool add_bos);
|
||||
|
||||
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
|
||||
LLAMA_API int llama_n_embd (struct llama_context * ctx);
|
||||
|
||||
// Token logits obtained from the last call to llama_eval()
|
||||
// The logits for the last token are stored in the last row
|
||||
// Can be mutated in order to change the probabilities of the next token
|
||||
// Rows: n_tokens
|
||||
// Cols: n_vocab
|
||||
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
|
||||
|
||||
// Get the embeddings for the input
|
||||
// shape: [n_embd] (1-dimensional)
|
||||
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
|
||||
|
||||
// Special tokens
|
||||
LLAMA_API llama_token llama_token_bos();
|
||||
LLAMA_API llama_token llama_token_eos();
|
||||
|
||||
// TODO: improve the last_n_tokens interface ?
|
||||
LLAMA_API llama_token llama_sample_top_p_top_k(
|
||||
struct llama_context * ctx,
|
||||
const llama_token * last_n_tokens_data,
|
||||
int last_n_tokens_size,
|
||||
int top_k,
|
||||
float top_p,
|
||||
float temp,
|
||||
float repeat_penalty);
|
||||
|
||||
// Performance information
|
||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
||||
|
||||
// Print system information
|
||||
LLAMA_API const char * llama_print_system_info(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||
#ifdef LLAMA_API_INTERNAL
|
||||
|
||||
#include "third_party/libcxx/vector"
|
||||
#include "third_party/libcxx/string"
|
||||
struct ggml_tensor;
|
||||
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
|
||||
#endif
|
||||
|
||||
#endif // LLAMA_H
|
388
third_party/ggml/llama_util.h
vendored
Executable file
388
third_party/ggml/llama_util.h
vendored
Executable file
|
@ -0,0 +1,388 @@
|
|||
// Internal header to be included only by llama.cpp.
|
||||
// Contains wrappers around OS interfaces.
|
||||
|
||||
#ifndef LLAMA_UTIL_H
|
||||
#define LLAMA_UTIL_H
|
||||
#include "libc/calls/struct/rlimit.h"
|
||||
#include "libc/fmt/fmt.h"
|
||||
#include "libc/runtime/sysconf.h"
|
||||
#include "libc/sysv/consts/madv.h"
|
||||
#include "libc/sysv/consts/map.h"
|
||||
#include "libc/sysv/consts/prot.h"
|
||||
#include "libc/sysv/consts/rlimit.h"
|
||||
#include "third_party/libcxx/cerrno"
|
||||
#include "third_party/libcxx/climits"
|
||||
#include "third_party/libcxx/cstdarg"
|
||||
#include "third_party/libcxx/cstdint"
|
||||
#include "third_party/libcxx/cstdio"
|
||||
#include "third_party/libcxx/cstdlib"
|
||||
#include "third_party/libcxx/cstring"
|
||||
#include "third_party/libcxx/string"
|
||||
#include "third_party/libcxx/vector"
|
||||
// clang-format off
|
||||
|
||||
#define LLAMA_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((__format__(__gnu_printf__, 1, 2)))
|
||||
#else
|
||||
__attribute__((__format__(__printf__, 1, 2)))
|
||||
#endif
|
||||
__attribute__((__noreturn__))
|
||||
#endif
|
||||
static void Die(const char *fmt, ...) {
|
||||
va_list va;
|
||||
va_start(va, fmt);
|
||||
vfprintf(stderr, fmt, va);
|
||||
va_end(va);
|
||||
fputc('\n', stderr);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
struct llama_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
Die("failed to open %s: %s", fname, std::strerror(errno));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
#ifdef _WIN32
|
||||
__int64 ret = _ftelli64(fp);
|
||||
#else
|
||||
long ret = std::ftell(fp);
|
||||
#endif
|
||||
LLAMA_ASSERT(ret != -1); // this really shouldn't fail
|
||||
return (size_t) ret;
|
||||
}
|
||||
|
||||
void seek(size_t offset, int whence) {
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long) offset, whence);
|
||||
#endif
|
||||
LLAMA_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
Die("read error: %s", strerror(errno));
|
||||
}
|
||||
if (ret != 1) {
|
||||
Die("unexpectedly reached end of file");
|
||||
}
|
||||
}
|
||||
|
||||
std::uint32_t read_u32() {
|
||||
std::uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string read_string(std::uint32_t len) {
|
||||
std::vector<char> chars(len);
|
||||
read_raw(chars.data(), len);
|
||||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
||||
if (ret != 1) {
|
||||
Die("write error: %s", strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#if defined(_WIN32)
|
||||
static std::string llama_format_win_err(DWORD err) {
|
||||
LPSTR buf;
|
||||
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
|
||||
if (!size) {
|
||||
return "FormatMessageA failed";
|
||||
}
|
||||
std::string ret(buf, size);
|
||||
LocalFree(buf);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct llama_mmap {
|
||||
void * addr;
|
||||
size_t size;
|
||||
|
||||
llama_mmap(const llama_mmap &) = delete;
|
||||
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
#ifdef __linux__
|
||||
flags |= MAP_POPULATE;
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
if (addr == MAP_FAILED) {
|
||||
Die("mmap failed: %s", strerror(errno));
|
||||
}
|
||||
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~llama_mmap() {
|
||||
munmap(addr, size);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_mmap(struct llama_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
DWORD error = GetLastError();
|
||||
|
||||
if (hMapping == NULL) {
|
||||
Die("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
|
||||
}
|
||||
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
error = GetLastError();
|
||||
CloseHandle(hMapping);
|
||||
|
||||
if (addr == NULL) {
|
||||
Die("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
|
||||
}
|
||||
|
||||
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
||||
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
}
|
||||
|
||||
~llama_mmap() {
|
||||
if (!UnmapViewOfFile(addr)) {
|
||||
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
llama_mmap(struct llama_file *) {
|
||||
Die("mmap not supported");
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
// Represents some region of memory being locked using mlock or VirtualLock;
|
||||
// will automatically unlock on destruction.
|
||||
struct llama_mlock {
|
||||
void * addr = NULL;
|
||||
size_t size = 0;
|
||||
bool failed_already = false;
|
||||
|
||||
llama_mlock() {}
|
||||
llama_mlock(const llama_mlock &) = delete;
|
||||
|
||||
~llama_mlock() {
|
||||
if (size) {
|
||||
raw_unlock(addr, size);
|
||||
}
|
||||
}
|
||||
|
||||
void init(void * addr) {
|
||||
LLAMA_ASSERT(this->addr == NULL && this->size == 0);
|
||||
this->addr = addr;
|
||||
}
|
||||
|
||||
void grow_to(size_t target_size) {
|
||||
LLAMA_ASSERT(addr);
|
||||
if (failed_already) {
|
||||
return;
|
||||
}
|
||||
size_t granularity = lock_granularity();
|
||||
target_size = (target_size + granularity - 1) & ~(granularity - 1);
|
||||
if (target_size > size) {
|
||||
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
|
||||
size = target_size;
|
||||
} else {
|
||||
failed_already = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _POSIX_MEMLOCK_RANGE
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) sysconf(_SC_PAGESIZE);
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
if (!mlock(addr, size)) {
|
||||
return true;
|
||||
} else {
|
||||
char* errmsg = std::strerror(errno);
|
||||
bool suggest = (errno == ENOMEM);
|
||||
|
||||
// Check if the resource limit is fine after all
|
||||
struct rlimit lock_limit;
|
||||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
||||
suggest = false;
|
||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
||||
suggest = false;
|
||||
|
||||
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
||||
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#undef MLOCK_SUGGESTION
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (munlock(addr, size)) {
|
||||
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
||||
}
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
return (size_t) si.dwPageSize;
|
||||
}
|
||||
|
||||
bool raw_lock(void * addr, size_t size) {
|
||||
for (int tries = 1; ; tries++) {
|
||||
if (VirtualLock(addr, size)) {
|
||||
return true;
|
||||
}
|
||||
if (tries == 2) {
|
||||
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
||||
size, this->size, llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// It failed but this was only the first try; increase the working
|
||||
// set size and try again.
|
||||
SIZE_T min_ws_size, max_ws_size;
|
||||
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
||||
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
// Per MSDN: "The maximum number of pages that a process can lock
|
||||
// is equal to the number of pages in its minimum working set minus
|
||||
// a small overhead."
|
||||
// Hopefully a megabyte is enough overhead:
|
||||
size_t increment = size + 1048576;
|
||||
// The minimum must be <= the maximum, so we need to increase both:
|
||||
min_ws_size += increment;
|
||||
max_ws_size += increment;
|
||||
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
||||
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (!VirtualUnlock(addr, size)) {
|
||||
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
||||
llama_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
void raw_lock(const void * addr, size_t size) {
|
||||
fprintf(stderr, "warning: mlock not supported on this system\n");
|
||||
}
|
||||
|
||||
void raw_unlock(const void * addr, size_t size) {}
|
||||
#endif
|
||||
};
|
||||
|
||||
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
|
||||
struct llama_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
void resize(size_t size) {
|
||||
delete[] addr;
|
||||
addr = new uint8_t[size];
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
~llama_buffer() {
|
||||
delete[] addr;
|
||||
}
|
||||
};
|
||||
#endif
|
568
third_party/ggml/main.cc
vendored
Normal file
568
third_party/ggml/main.cc
vendored
Normal file
|
@ -0,0 +1,568 @@
|
|||
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
|
||||
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi│
|
||||
╚──────────────────────────────────────────────────────────────────────────────╝
|
||||
│ │
|
||||
│ llama.cpp │
|
||||
│ Copyright (c) 2023 Georgi Gerganov │
|
||||
│ │
|
||||
│ Permission is hereby granted, free of charge, to any person obtaining │
|
||||
│ a copy of this software and associated documentation files (the │
|
||||
│ "Software"), to deal in the Software without restriction, including │
|
||||
│ without limitation the rights to use, copy, modify, merge, publish, │
|
||||
│ distribute, sublicense, and/or sell copies of the Software, and to │
|
||||
│ permit persons to whom the Software is furnished to do so, subject to │
|
||||
│ the following conditions: │
|
||||
│ │
|
||||
│ The above copyright notice and this permission notice shall be │
|
||||
│ included in all copies or substantial portions of the Software. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, │
|
||||
│ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF │
|
||||
│ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. │
|
||||
│ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY │
|
||||
│ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, │
|
||||
│ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE │
|
||||
│ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. │
|
||||
│ │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
|
||||
asm(".ident\t\"\\n\\n\
|
||||
llama.cpp (MIT License)\\n\
|
||||
Copyright (c) 2023 Georgi Gerganov\"");
|
||||
asm(".include \"libc/disclaimer.inc\"");
|
||||
// clang-format off
|
||||
|
||||
// Defines sigaction on msys:
|
||||
#ifndef _GNU_SOURCE
|
||||
#define _GNU_SOURCE
|
||||
#endif
|
||||
|
||||
#include "third_party/ggml/common.h"
|
||||
#include "libc/nexgen32e/x86feature.h"
|
||||
#include "third_party/ggml/llama.h"
|
||||
|
||||
#include "third_party/libcxx/cassert"
|
||||
#include "third_party/libcxx/cinttypes"
|
||||
#include "third_party/libcxx/cmath"
|
||||
#include "third_party/libcxx/cstdio"
|
||||
#include "third_party/libcxx/cstring"
|
||||
#include "third_party/libcxx/ctime"
|
||||
#include "third_party/libcxx/fstream"
|
||||
#include "third_party/libcxx/iostream"
|
||||
#include "third_party/libcxx/string"
|
||||
#include "third_party/libcxx/vector"
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/sigtimedwait.h"
|
||||
#include "libc/calls/struct/sigaction.h"
|
||||
#include "libc/calls/struct/siginfo.h"
|
||||
#include "libc/sysv/consts/sa.h"
|
||||
#include "libc/sysv/consts/sicode.h"
|
||||
#include "libc/sysv/consts/ss.h"
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/weirdtypes.h"
|
||||
#include "libc/runtime/pathconf.h"
|
||||
#include "libc/runtime/runtime.h"
|
||||
#include "libc/runtime/sysconf.h"
|
||||
#include "libc/sysv/consts/f.h"
|
||||
#include "libc/sysv/consts/fileno.h"
|
||||
#include "libc/sysv/consts/o.h"
|
||||
#include "libc/sysv/consts/ok.h"
|
||||
#include "libc/time/time.h"
|
||||
#include "third_party/getopt/getopt.h"
|
||||
#include "third_party/musl/crypt.h"
|
||||
#include "third_party/musl/lockf.h"
|
||||
#elif defined (_WIN32)
|
||||
#include "libc/calls/calls.h"
|
||||
#include "libc/calls/sigtimedwait.h"
|
||||
#include "libc/calls/struct/sigaction.h"
|
||||
#include "libc/calls/struct/siginfo.h"
|
||||
#include "libc/sysv/consts/sa.h"
|
||||
#include "libc/sysv/consts/sicode.h"
|
||||
#include "libc/sysv/consts/ss.h"
|
||||
#endif
|
||||
|
||||
static console_state con_st;
|
||||
static llama_context ** g_ctx;
|
||||
|
||||
static bool is_interacting = false;
|
||||
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
void sigint_handler(int signo) {
|
||||
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
|
||||
printf("\n"); // this also force flush stdout.
|
||||
if (signo == SIGINT) {
|
||||
if (!is_interacting) {
|
||||
is_interacting=true;
|
||||
} else {
|
||||
llama_print_timings(*g_ctx);
|
||||
_exit(130);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static int on_missing_feature(const char *name) {
|
||||
fprintf(stderr, "error: we require %s support in your microprocessor.\n", name);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
|
||||
if (!X86_HAVE(AVX2)) return on_missing_feature("avx2");
|
||||
if (!X86_HAVE(AVX)) return on_missing_feature("avx");
|
||||
if (!X86_HAVE(FMA)) return on_missing_feature("fma");
|
||||
if (!X86_HAVE(F16C)) return on_missing_feature("f16c");
|
||||
if (!X86_HAVE(SSE3)) return on_missing_feature("sse3");
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
// save choice to use color for later
|
||||
// (note for later: this is a slightly awkward choice)
|
||||
con_st.use_color = params.use_color;
|
||||
|
||||
#if defined (_WIN32)
|
||||
win32_console_init(params.use_color);
|
||||
#endif
|
||||
|
||||
if (params.perplexity) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.embedding) {
|
||||
printf("\n************\n");
|
||||
printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
|
||||
printf("************\n\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (params.n_ctx > 2048) {
|
||||
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
||||
"expect poor results\n", __func__, params.n_ctx);
|
||||
}
|
||||
|
||||
if (params.seed <= 0) {
|
||||
params.seed = time(NULL);
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
|
||||
|
||||
std::mt19937 rng(params.seed);
|
||||
if (params.random_prompt) {
|
||||
params.prompt = gpt_random_prompt(rng);
|
||||
}
|
||||
|
||||
// params.prompt = R"(// this function checks if the number n is prime
|
||||
//bool is_prime(int n) {)";
|
||||
|
||||
llama_context * ctx;
|
||||
g_ctx = &ctx;
|
||||
|
||||
// load the model
|
||||
{
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_parts = params.n_parts;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
|
||||
ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.lora_adapter.empty()) {
|
||||
int err = llama_apply_lora_from_file(ctx,
|
||||
params.lora_adapter.c_str(),
|
||||
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
|
||||
params.n_threads);
|
||||
if (err != 0) {
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// print system information
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
|
||||
}
|
||||
|
||||
// determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
|
||||
// uncomment the "used_mem" line in llama.cpp to see the results
|
||||
if (params.mem_test) {
|
||||
{
|
||||
const std::vector<llama_token> tmp(params.n_batch, 0);
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
}
|
||||
|
||||
{
|
||||
const std::vector<llama_token> tmp = { 0, };
|
||||
llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
|
||||
}
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
|
||||
// tokenize the prompt
|
||||
auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
|
||||
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
||||
if ((int) embd_inp.size() > n_ctx - 4) {
|
||||
fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// number of tokens to keep when resetting context
|
||||
if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
|
||||
params.n_keep = (int)embd_inp.size();
|
||||
}
|
||||
|
||||
// prefix & suffix for instruct mode
|
||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
|
||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
||||
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.push_back("### Instruction:\n\n");
|
||||
}
|
||||
|
||||
// enable interactive mode if reverse prompt or interactive start is specified
|
||||
if (params.antiprompt.size() != 0 || params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
// determine newline token
|
||||
auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
|
||||
|
||||
if (params.verbose_prompt) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
|
||||
fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
|
||||
for (int i = 0; i < (int) embd_inp.size(); i++) {
|
||||
fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
|
||||
}
|
||||
if (params.n_keep > 0) {
|
||||
fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
|
||||
for (int i = 0; i < params.n_keep; i++) {
|
||||
fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
|
||||
}
|
||||
fprintf(stderr, "'\n");
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
if (params.interactive) {
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||
struct sigaction sigint_action;
|
||||
sigint_action.sa_handler = sigint_handler;
|
||||
sigemptyset (&sigint_action.sa_mask);
|
||||
sigint_action.sa_flags = 0;
|
||||
sigaction(SIGINT, &sigint_action, NULL);
|
||||
#elif defined (_WIN32)
|
||||
signal(SIGINT, sigint_handler);
|
||||
#endif
|
||||
|
||||
fprintf(stderr, "%s: interactive mode on.\n", __func__);
|
||||
|
||||
if (params.antiprompt.size()) {
|
||||
for (auto antiprompt : params.antiprompt) {
|
||||
fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
|
||||
}
|
||||
}
|
||||
|
||||
if (!params.input_prefix.empty()) {
|
||||
fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
|
||||
params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
|
||||
fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
// TODO: replace with ring-buffer
|
||||
std::vector<llama_token> last_n_tokens(n_ctx);
|
||||
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
|
||||
|
||||
if (params.interactive) {
|
||||
fprintf(stderr, "== Running in interactive mode. ==\n"
|
||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||
" - Press Ctrl+C to interject at any time.\n"
|
||||
#endif
|
||||
" - Press Return to return control to LLaMa.\n"
|
||||
" - If you want to submit another line, end your input in '\\'.\n\n");
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool is_antiprompt = false;
|
||||
bool input_noecho = false;
|
||||
|
||||
int n_past = 0;
|
||||
int n_remain = params.n_predict;
|
||||
int n_consumed = 0;
|
||||
|
||||
// the first thing we will do is to output the prompt, so set color accordingly
|
||||
set_console_color(con_st, CONSOLE_COLOR_PROMPT);
|
||||
|
||||
std::vector<llama_token> embd;
|
||||
|
||||
while (n_remain != 0 || params.interactive) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
// infinite text generation via context swapping
|
||||
// if we run out of context:
|
||||
// - take the n_keep first tokens from the original prompt (via n_past)
|
||||
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
|
||||
if (n_past + (int) embd.size() > n_ctx) {
|
||||
const int n_left = n_past - params.n_keep;
|
||||
|
||||
n_past = params.n_keep;
|
||||
|
||||
// insert n_left/2 tokens at the start of embd from last_n_tokens
|
||||
embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
|
||||
|
||||
//printf("\n---\n");
|
||||
//printf("resetting: '");
|
||||
//for (int i = 0; i < (int) embd.size(); i++) {
|
||||
// printf("%s", llama_token_to_str(ctx, embd[i]));
|
||||
//}
|
||||
//printf("'\n");
|
||||
//printf("\n---\n");
|
||||
}
|
||||
|
||||
// evaluate tokens in batches
|
||||
// embd is typically prepared beforehand to fit within a batch, but not always
|
||||
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
|
||||
int n_eval = (int) embd.size() - i;
|
||||
if (n_eval > params.n_batch) {
|
||||
n_eval = params.n_batch;
|
||||
}
|
||||
if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "%s : failed to eval\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += n_eval;
|
||||
}
|
||||
}
|
||||
|
||||
embd.clear();
|
||||
|
||||
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
|
||||
// out of user input, sample next token
|
||||
const int32_t top_k = params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float temp = params.temp;
|
||||
const float repeat_penalty = params.repeat_penalty;
|
||||
|
||||
llama_token id = 0;
|
||||
|
||||
{
|
||||
auto logits = llama_get_logits(ctx);
|
||||
|
||||
if (params.ignore_eos) {
|
||||
logits[llama_token_eos()] = 0;
|
||||
}
|
||||
|
||||
id = llama_sample_top_p_top_k(ctx,
|
||||
last_n_tokens.data() + n_ctx - params.repeat_last_n,
|
||||
params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
|
||||
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(id);
|
||||
}
|
||||
|
||||
// replace end of text token with newline token when in interactive mode
|
||||
if (id == llama_token_eos() && params.interactive && !params.instruct) {
|
||||
id = llama_token_newline.front();
|
||||
if (params.antiprompt.size() != 0) {
|
||||
// tokenize and inject first reverse prompt
|
||||
const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false);
|
||||
embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
|
||||
}
|
||||
}
|
||||
|
||||
// add it to the context
|
||||
embd.push_back(id);
|
||||
|
||||
// echo this to console
|
||||
input_noecho = false;
|
||||
|
||||
// decrement remaining sampling budget
|
||||
--n_remain;
|
||||
} else {
|
||||
// some user input remains from prompt or interaction, forward it to processing
|
||||
while ((int) embd_inp.size() > n_consumed) {
|
||||
embd.push_back(embd_inp[n_consumed]);
|
||||
last_n_tokens.erase(last_n_tokens.begin());
|
||||
last_n_tokens.push_back(embd_inp[n_consumed]);
|
||||
++n_consumed;
|
||||
if ((int) embd.size() >= params.n_batch) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// display text
|
||||
if (!input_noecho) {
|
||||
for (auto id : embd) {
|
||||
printf("%s", llama_token_to_str(ctx, id));
|
||||
}
|
||||
fflush(stdout);
|
||||
}
|
||||
// reset color to default if we there is no pending user input
|
||||
if (!input_noecho && (int)embd_inp.size() == n_consumed) {
|
||||
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
|
||||
}
|
||||
|
||||
// in interactive mode, and not currently processing queued inputs;
|
||||
// check if we should prompt the user for more
|
||||
if (params.interactive && (int) embd_inp.size() <= n_consumed) {
|
||||
|
||||
// check for reverse prompt
|
||||
if (params.antiprompt.size()) {
|
||||
std::string last_output;
|
||||
for (auto id : last_n_tokens) {
|
||||
last_output += llama_token_to_str(ctx, id);
|
||||
}
|
||||
|
||||
is_antiprompt = false;
|
||||
// Check if each of the reverse prompts appears at the end of the output.
|
||||
for (std::string & antiprompt : params.antiprompt) {
|
||||
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
|
||||
is_interacting = true;
|
||||
is_antiprompt = true;
|
||||
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
||||
fflush(stdout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (n_past > 0 && is_interacting) {
|
||||
// potentially set color to indicate we are taking user input
|
||||
set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
|
||||
|
||||
#if defined (_WIN32)
|
||||
// Windows: must reactivate sigint handler after each signal
|
||||
signal(SIGINT, sigint_handler);
|
||||
#endif
|
||||
|
||||
if (params.instruct) {
|
||||
printf("\n> ");
|
||||
}
|
||||
|
||||
std::string buffer;
|
||||
if (!params.input_prefix.empty()) {
|
||||
buffer += params.input_prefix;
|
||||
printf("%s", buffer.c_str());
|
||||
}
|
||||
|
||||
std::string line;
|
||||
bool another_line = true;
|
||||
do {
|
||||
#if defined(_WIN32)
|
||||
std::wstring wline;
|
||||
if (!std::getline(std::wcin, wline)) {
|
||||
// input stream is bad or EOF received
|
||||
return 0;
|
||||
}
|
||||
win32_utf8_encode(wline, line);
|
||||
#else
|
||||
if (!std::getline(std::cin, line)) {
|
||||
// input stream is bad or EOF received
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
if (line.empty() || line.back() != '\\') {
|
||||
another_line = false;
|
||||
} else {
|
||||
line.pop_back(); // Remove the continue character
|
||||
}
|
||||
buffer += line + '\n'; // Append the line to the result
|
||||
} while (another_line);
|
||||
|
||||
// done taking input, reset color
|
||||
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
|
||||
|
||||
// Add tokens to embd only if the input buffer is non-empty
|
||||
// Entering a empty line lets the user pass control back
|
||||
if (buffer.length() > 1) {
|
||||
|
||||
// instruct mode: insert instruction prefix
|
||||
if (params.instruct && !is_antiprompt) {
|
||||
n_consumed = embd_inp.size();
|
||||
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
|
||||
}
|
||||
|
||||
auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||
|
||||
// instruct mode: insert response suffix
|
||||
if (params.instruct) {
|
||||
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
|
||||
}
|
||||
|
||||
n_remain -= line_inp.size();
|
||||
}
|
||||
|
||||
input_noecho = true; // do not echo this again
|
||||
}
|
||||
|
||||
if (n_past > 0) {
|
||||
is_interacting = false;
|
||||
}
|
||||
}
|
||||
|
||||
// end of text token
|
||||
if (!embd.empty() && embd.back() == llama_token_eos()) {
|
||||
if (params.instruct) {
|
||||
is_interacting = true;
|
||||
} else {
|
||||
fprintf(stderr, " [end of text]\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
|
||||
if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
|
||||
n_remain = params.n_predict;
|
||||
is_interacting = true;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined (_WIN32)
|
||||
signal(SIGINT, SIG_DFL);
|
||||
#endif
|
||||
|
||||
llama_print_timings(ctx);
|
||||
llama_free(ctx);
|
||||
|
||||
set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
|
||||
|
||||
return 0;
|
||||
}
|
4
third_party/libcxx/__hash_table
vendored
4
third_party/libcxx/__hash_table
vendored
|
@ -1066,7 +1066,7 @@ public:
|
|||
|
||||
#ifndef _LIBCPP_CXX03_LANG
|
||||
template <class _Key, class ..._Args>
|
||||
_LIBCPP_INLINE_VISIBILITY
|
||||
inline _LIBCPP_INLINE_VISIBILITY
|
||||
pair<iterator, bool> __emplace_unique_key_args(_Key const& __k, _Args&&... __args);
|
||||
|
||||
template <class... _Args>
|
||||
|
@ -2104,7 +2104,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const&
|
|||
size_type __bc = bucket_count();
|
||||
bool __inserted = false;
|
||||
__next_pointer __nd;
|
||||
size_t __chash;
|
||||
size_t __chash = 0;
|
||||
if (__bc != 0)
|
||||
{
|
||||
__chash = __constrain_hash(__hash, __bc);
|
||||
|
|
1
third_party/third_party.mk
vendored
1
third_party/third_party.mk
vendored
|
@ -13,6 +13,7 @@ o/$(MODE)/third_party: \
|
|||
o/$(MODE)/third_party/finger \
|
||||
o/$(MODE)/third_party/gdtoa \
|
||||
o/$(MODE)/third_party/getopt \
|
||||
o/$(MODE)/third_party/ggml \
|
||||
o/$(MODE)/third_party/hiredis \
|
||||
o/$(MODE)/third_party/libcxx \
|
||||
o/$(MODE)/third_party/linenoise \
|
||||
|
|
Loading…
Reference in a new issue