Import llama.cpp

https://github.com/ggerganov/llama.cpp 0b2da20538d01926b77ea237dd1c930c4d20b686
See third_party/ggml/README.cosmo for changes
parent f42089d5c6
commit e8b43903b2

14 changed files with 18313 additions and 2 deletions

Makefile (1 addition)
@@ -144,6 +144,7 @@ include libc/stdio/stdio.mk			# │
 include third_party/libcxx/libcxx.mk		# │
 include net/net.mk				# │
 include third_party/vqsort/vqsort.mk		# │
+include third_party/ggml/ggml.mk		# │
 include libc/log/log.mk				# │
 include third_party/bzip2/bzip2.mk		# │
 include dsp/core/core.mk			# │

third_party/ggml/LICENSE (new vendored file, 21 lines)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2023 Georgi Gerganov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

third_party/ggml/README.cosmo (new vendored file, 21 lines)
@@ -0,0 +1,21 @@
DESCRIPTION

  ggml is a machine learning library useful for LLM inference on CPUs

LICENSE

  MIT

ORIGIN

  https://github.com/ggerganov/llama.cpp
  commit 0b2da20538d01926b77ea237dd1c930c4d20b686
  Author: Stephan Walter <stephan@walter.name>
  Date:   Wed Apr 26 20:26:42 2023 +0000
  ggml : slightly faster AVX2 implementation for Q5 (#1197)

LOCAL CHANGES

  - Refactor headers per cosmo convention
  - Replace code like 'ggjt' with READ32BE("ggjt")
  - Remove C++ exceptions; use Die() function instead
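
The READ32BE change matters because a multicharacter constant such as 'ggjt'
has an implementation-defined value in C and C++, while reading the four bytes
explicitly in big-endian order is well-defined. A minimal sketch of the idea,
using a hypothetical stand-in for cosmo's READ32BE macro (the real macro lives
in cosmo's libc headers, not here):

    #include <cstdint>
    #include <cstdio>

    // Hypothetical re-implementation for illustration only: read four
    // bytes in big-endian order, producing a well-defined 32-bit value.
    #define READ32BE(s)                  \
      ((uint32_t)(uint8_t)(s)[0] << 24 | \
       (uint32_t)(uint8_t)(s)[1] << 16 | \
       (uint32_t)(uint8_t)(s)[2] << 8 |  \
       (uint32_t)(uint8_t)(s)[3])

    int main() {
        // prints 0x67676a74, the "ggjt" file magic, by construction
        // rather than by compiler convention
        printf("0x%08x\n", READ32BE("ggjt"));
    }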

third_party/ggml/common.cc (new vendored file, 385 lines)
@@ -0,0 +1,385 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
│vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│
╚──────────────────────────────────────────────────────────────────────────────╝
│                                                                              │
│  llama.cpp                                                                   │
│  Copyright (c) 2023 Georgi Gerganov                                          │
│                                                                              │
│  Permission is hereby granted, free of charge, to any person obtaining       │
│  a copy of this software and associated documentation files (the             │
│  "Software"), to deal in the Software without restriction, including         │
│  without limitation the rights to use, copy, modify, merge, publish,         │
│  distribute, sublicense, and/or sell copies of the Software, and to          │
│  permit persons to whom the Software is furnished to do so, subject to       │
│  the following conditions:                                                   │
│                                                                              │
│  The above copyright notice and this permission notice shall be              │
│  included in all copies or substantial portions of the Software.             │
│                                                                              │
│  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │
│  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │
│  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │
│  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │
│  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │
│  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │
│  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │
│                                                                              │
╚─────────────────────────────────────────────────────────────────────────────*/

asm(".ident\t\"\\n\\n\
llama.cpp (MIT License)\\n\
Copyright (c) 2023 Georgi Gerganov\"");
asm(".include \"libc/disclaimer.inc\"");
// clang-format off

#include "third_party/ggml/common.h"

#include "third_party/libcxx/cassert"
#include "third_party/libcxx/cstring"
#include "third_party/libcxx/fstream"
#include "third_party/libcxx/string"
#include "third_party/libcxx/iterator"
#include "third_party/libcxx/algorithm"

#if defined (_WIN32)
#include "libc/calls/calls.h"
#include "libc/calls/struct/flock.h"
#include "libc/calls/weirdtypes.h"
#include "libc/sysv/consts/at.h"
#include "libc/sysv/consts/f.h"
#include "libc/sysv/consts/fd.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/posix.h"
#include "libc/sysv/consts/s.h"
// MISSING #include <io.h>
#pragma comment(lib,"kernel32.lib")
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
                                                                   const wchar_t * lpWideCharStr, int cchWideChar,
                                                                   char * lpMultiByteStr, int cbMultiByte,
                                                                   const char * lpDefaultChar, bool * lpUsedDefaultChar);
#define CP_UTF8 65001
#endif

bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    // determine sensible default number of threads.
    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
#ifdef __linux__
    std::ifstream cpuinfo("/proc/cpuinfo");
    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
                                  std::istream_iterator<std::string>(),
                                  std::string("processor"));
#endif
    if (params.n_threads == 0) {
        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
    }

    bool invalid_param = false;
    std::string arg;
    gpt_params default_params;

    for (int i = 1; i < argc; i++) {
        arg = argv[i];

        if (arg == "-s" || arg == "--seed") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.seed = std::stoi(argv[i]);
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_threads = std::stoi(argv[i]);
        } else if (arg == "-p" || arg == "--prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.prompt = argv[i];
        } else if (arg == "-f" || arg == "--file") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::ifstream file(argv[i]);
            if (!file) {
                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
                invalid_param = true;
                break;
            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        } else if (arg == "-n" || arg == "--n_predict") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_predict = std::stoi(argv[i]);
        } else if (arg == "--top_k") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.top_k = std::stoi(argv[i]);
        } else if (arg == "-c" || arg == "--ctx_size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_ctx = std::stoi(argv[i]);
        } else if (arg == "--memory_f32") {
            params.memory_f16 = false;
        } else if (arg == "--top_p") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.top_p = std::stof(argv[i]);
        } else if (arg == "--temp") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.temp = std::stof(argv[i]);
        } else if (arg == "--repeat_last_n") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.repeat_last_n = std::stoi(argv[i]);
        } else if (arg == "--repeat_penalty") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.repeat_penalty = std::stof(argv[i]);
        } else if (arg == "-b" || arg == "--batch_size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_batch = std::stoi(argv[i]);
            params.n_batch = std::min(512, params.n_batch);
        } else if (arg == "--keep") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_keep = std::stoi(argv[i]);
        } else if (arg == "-m" || arg == "--model") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.model = argv[i];
        } else if (arg == "--lora") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.lora_adapter = argv[i];
            params.use_mmap = false;
        } else if (arg == "--lora-base") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.lora_base = argv[i];
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
        } else if (arg == "--embedding") {
            params.embedding = true;
        } else if (arg == "--interactive-first") {
            params.interactive_first = true;
        } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
        } else if (arg == "--color") {
            params.use_color = true;
        } else if (arg == "--mlock") {
            params.use_mlock = true;
        } else if (arg == "--no-mmap") {
            params.use_mmap = false;
        } else if (arg == "--mtest") {
            params.mem_test = true;
        } else if (arg == "--verbose-prompt") {
            params.verbose_prompt = true;
        } else if (arg == "-r" || arg == "--reverse-prompt") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.antiprompt.push_back(argv[i]);
        } else if (arg == "--perplexity") {
            params.perplexity = true;
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "--n_parts") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.n_parts = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
            gpt_print_usage(argc, argv, default_params);
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
        } else if (arg == "--in-prefix") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.input_prefix = argv[i];
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            gpt_print_usage(argc, argv, default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }

    return true;
}

void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
    fprintf(stderr, "                        specified more than once for multiple prompts).\n");
    fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1, use random seed for <= 0)\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        prompt file to start generation.\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", (double)params.top_p);
    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
    fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating\n");
    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    if (llama_mlock_supported()) {
        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
    if (llama_mmap_supported()) {
        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
    }
    fprintf(stderr, "  --mtest               compute maximum memory usage\n");
    fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
    fprintf(stderr, "  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
    fprintf(stderr, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
}

std::string gpt_random_prompt(std::mt19937 & rng) {
    const int r = rng() % 10;
    switch (r) {
        case 0: return "So";
        case 1: return "Once upon a time";
        case 2: return "When";
        case 3: return "The";
        case 4: return "After";
        case 5: return "If";
        case 6: return "import";
        case 7: return "He";
        case 8: return "She";
        case 9: return "They";
        default: return "To";
    }

    return "The";
}

// TODO: not great allocating this every time
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
    std::vector<llama_token> res(text.size() + (int)add_bos);
    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
    assert(n >= 0);
    res.resize(n);

    return res;
}

/* Keep track of current color of output, and emit ANSI code if it changes. */
void set_console_color(console_state & con_st, console_color_t color) {
    if (con_st.use_color && con_st.color != color) {
        switch(color) {
            case CONSOLE_COLOR_DEFAULT:
                printf(ANSI_COLOR_RESET);
                break;
            case CONSOLE_COLOR_PROMPT:
                printf(ANSI_COLOR_YELLOW);
                break;
            case CONSOLE_COLOR_USER_INPUT:
                printf(ANSI_BOLD ANSI_COLOR_GREEN);
                break;
        }
        con_st.color = color;
    }
}

#if defined (_WIN32)
void win32_console_init(bool enable_color) {
    unsigned long dwMode = 0;
    void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
    if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
        hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
        if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
            hConOut = 0;
        }
    }
    if (hConOut) {
        // Enable ANSI colors on Windows 10+
        if (enable_color && !(dwMode & 0x4)) {
            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
        }
        // Set console output codepage to UTF8
        SetConsoleOutputCP(CP_UTF8);
    }
    void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
    if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
        // Set console input codepage to UTF16
        _setmode(_fileno(stdin), _O_WTEXT);
    }
}

// Convert a wide Unicode string to an UTF8 string
void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
    int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
    std::string strTo(size_needed, 0);
    WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
    str = strTo;
}
#endif
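
For orientation, a minimal sketch of how a driver might use the helpers above
(a hypothetical main.cc, not part of this commit; it assumes only the
declarations from common.h):

    #include "third_party/ggml/common.h"

    #include "third_party/libcxx/cstdio"
    #include "third_party/libcxx/random"

    int main(int argc, char ** argv) {
        gpt_params params;  // field defaults come from common.h
        // gpt_params_parse() prints usage and exits on bad arguments,
        // so reaching the next line means params is fully populated
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }
        std::mt19937 rng(params.seed);
        if (params.random_prompt) {
            params.prompt = gpt_random_prompt(rng);
        }
        fprintf(stderr, "threads=%d ctx=%d prompt=\"%s\"\n",
                params.n_threads, params.n_ctx, params.prompt.c_str());
    }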

third_party/ggml/common.h (new vendored file, 103 lines)
@@ -0,0 +1,103 @@
// -*- c++ -*-
// clang-format off
#ifndef COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
#define COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_
#include "third_party/ggml/llama.h"
#include "third_party/libcxx/string"
#include "third_party/libcxx/vector"
#include "third_party/libcxx/random"
#include "third_party/libcxx/thread"
#if !(__ASSEMBLER__ + __LINKER__ + 0)
// clang-format off
// Various helper functions and utilities

//
// CLI argument parsing
//

struct gpt_params {
    int32_t seed          = -1;   // RNG seed
    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_predict     = 128;  // new tokens to predict
    int32_t repeat_last_n = 64;   // last n tokens to penalize
    int32_t n_parts       = -1;   // amount of model parts (-1 = determine from model dimensions)
    int32_t n_ctx         = 512;  // context size
    int32_t n_batch       = 512;  // batch size for prompt processing (must be >=32 to use BLAS)
    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt

    // sampling parameters
    int32_t top_k = 40;
    float   top_p = 0.95f;
    float   temp  = 0.80f;
    float   repeat_penalty  = 1.10f;

    std::string model  = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt = "";
    std::string input_prefix = "";       // string to prefix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

    std::string lora_adapter = "";  // lora adapter path
    std::string lora_base = "";     // base model path for the lora adapter

    bool memory_f16        = true;  // use f16 instead of f32 for memory kv
    bool random_prompt     = false; // do not randomize prompt if none provided
    bool use_color         = false; // use color to distinguish generations and inputs
    bool interactive       = false; // interactive mode

    bool embedding         = false; // get only sentence embedding
    bool interactive_first = false; // wait for user input immediately

    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool ignore_eos        = false; // do not stop generating after eos
    bool perplexity        = false; // compute perplexity over the prompt
    bool use_mmap          = true;  // use mmap for faster loads
    bool use_mlock         = false; // use mlock to keep model in memory
    bool mem_test          = false; // compute maximum memory usage
    bool verbose_prompt    = false; // print prompt tokens before generation
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

std::string gpt_random_prompt(std::mt19937 & rng);

//
// Vocab utils
//

std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);

//
// Console utils
//

#define ANSI_COLOR_RED     "\x1b[31m"
#define ANSI_COLOR_GREEN   "\x1b[32m"
#define ANSI_COLOR_YELLOW  "\x1b[33m"
#define ANSI_COLOR_BLUE    "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN    "\x1b[36m"
#define ANSI_COLOR_RESET   "\x1b[0m"
#define ANSI_BOLD          "\x1b[1m"

enum console_color_t {
    CONSOLE_COLOR_DEFAULT=0,
    CONSOLE_COLOR_PROMPT,
    CONSOLE_COLOR_USER_INPUT
};

struct console_state {
    bool use_color = false;
    console_color_t color = CONSOLE_COLOR_DEFAULT;
};

void set_console_color(console_state & con_st, console_color_t color);

#if defined (_WIN32)
void win32_console_init(bool enable_color);
void win32_utf8_encode(const std::wstring & wstr, std::string & str);
#endif

#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_GGML_COMMON_H_ */
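
A short usage sketch of the console helpers (hypothetical snippet, not part of
this commit): because set_console_color() tracks the current color in
console_state and only emits an ANSI escape when the color actually changes,
callers can request a color unconditionally around each phase of output.

    #include "third_party/ggml/common.h"
    #include "third_party/libcxx/cstdio"

    int main() {
        console_state con_st;
        con_st.use_color = true;  // typically copied from gpt_params::use_color

        set_console_color(con_st, CONSOLE_COLOR_PROMPT);      // emits yellow
        printf("Once upon a time");
        set_console_color(con_st, CONSOLE_COLOR_PROMPT);      // no-op, color unchanged
        set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);  // emits bold green
        printf(" user input here");
        set_console_color(con_st, CONSOLE_COLOR_DEFAULT);     // reset before exiting
        printf("\n");
    }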

third_party/ggml/ggml.c (new vendored file, 13137 lines)
(file diff suppressed because it is too large)

third_party/ggml/ggml.h (new vendored file, 889 lines)
@@ -0,0 +1,889 @@
| // clang-format off
 | ||||
| #ifndef COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_ | ||||
| #define COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_ | ||||
| #if !(__ASSEMBLER__ + __LINKER__ + 0) | ||||
| COSMOPOLITAN_C_START_ | ||||
| // clang-format off
 | ||||
| 
 | ||||
| //
 | ||||
| // GGML Tensor Library
 | ||||
| //
 | ||||
| // This documentation is still a work in progress.
 | ||||
| // If you wish some specific topics to be covered, feel free to drop a comment:
 | ||||
| //
 | ||||
| //   https://github.com/ggerganov/whisper.cpp/issues/40
 | ||||
| //
 | ||||
| // ## Overview
 | ||||
| //
 | ||||
| // This library implements:
 | ||||
| //
 | ||||
| //  - a set of tensor operations
 | ||||
| //  - automatic differentiation
 | ||||
| //  - basic optimization algorithms
 | ||||
| //
 | ||||
| // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
 | ||||
| // but is not limited to, the following:
 | ||||
| //
 | ||||
| //  - linear regression
 | ||||
| //  - support vector machines
 | ||||
| //  - neural networks
 | ||||
| //
 | ||||
| // The library allows the user to define a certain function using the available tensor operations. This function
 | ||||
| // definition is represented internally via a computation graph. Each tensor operation in the function definition
 | ||||
| // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
 | ||||
| // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
 | ||||
| // using one of the available optimization algorithms.
 | ||||
| //
 | ||||
| // For example, here we define the function: f(x) = a*x^2 + b
 | ||||
| //
 | ||||
| //   {
 | ||||
| //       struct ggml_init_params params = {
 | ||||
| //           .mem_size   = 16*1024*1024,
 | ||||
| //           .mem_buffer = NULL,
 | ||||
| //       };
 | ||||
| //
 | ||||
| //       // memory allocation happens here
 | ||||
| //       struct ggml_context * ctx = ggml_init(params);
 | ||||
| //
 | ||||
| //       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 | ||||
| //
 | ||||
| //       ggml_set_param(ctx, x); // x is an input variable
 | ||||
| //
 | ||||
| //       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 | ||||
| //       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 | ||||
| //       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
 | ||||
| //       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
 | ||||
| //
 | ||||
| //       ...
 | ||||
| //   }
 | ||||
| //
 | ||||
| // Notice that the function definition above does not involve any actual computation. The computation is performed only
 | ||||
| // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
 | ||||
| //
 | ||||
| //   {
 | ||||
| //       ...
 | ||||
| //
 | ||||
| //       struct ggml_cgraph gf = ggml_build_forward(f);
 | ||||
| //
 | ||||
| //       // set the input variable and parameter values
 | ||||
| //       ggml_set_f32(x, 2.0f);
 | ||||
| //       ggml_set_f32(a, 3.0f);
 | ||||
| //       ggml_set_f32(b, 4.0f);
 | ||||
| //
 | ||||
| //       ggml_graph_compute(ctx0, &gf);
 | ||||
| //
 | ||||
| //       printf("f = %f\n", ggml_get_f32_1d(f, 0));
 | ||||
| //
 | ||||
| //       ...
 | ||||
| //   }
 | ||||
| //
 | ||||
| // The actual computation is performed in the ggml_graph_compute() function.
 | ||||
| //
 | ||||
| // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
 | ||||
| // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
 | ||||
| // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
 | ||||
| // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
 | ||||
| // actually needed.
 | ||||
| //
 | ||||
| // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
 | ||||
| // differentiation and optimization algorithms.
 | ||||
| //
 | ||||
| // The described approach allows to define the function graph once and then compute its forward or backward graphs
 | ||||
| // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
 | ||||
| // the user can avoid the memory allocation overhead at runtime.
 | ||||
| //
 | ||||
| // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
 | ||||
| // citizens, but in theory the library can be extended to support FP8 and integer data types.
 | ||||
| //
 | ||||
| // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
 | ||||
| // and binary operations. Most of the available operations fall into one of these two categories. With time, it became
 | ||||
| // clear that the library needs to support more complex operations. The way to support these operations is not clear
 | ||||
| // yet, but a few examples are demonstrated in the following operations:
 | ||||
| //
 | ||||
| //   - ggml_permute()
 | ||||
| //   - ggml_conv_1d_1s()
 | ||||
| //   - ggml_conv_1d_2s()
 | ||||
| //
 | ||||
| // For each tensor operator, the library implements a forward and backward computation function. The forward function
 | ||||
| // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
 | ||||
| // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
 | ||||
| // calculus class, or watch the following video:
 | ||||
| //
 | ||||
| //   What is Automatic Differentiation?
 | ||||
| //   https://www.youtube.com/watch?v=wG_nF1awSSY
 | ||||
| //
 | ||||
| //
 | ||||
| // ## Tensor data (struct ggml_tensor)
 | ||||
| //
 | ||||
| // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
 | ||||
| // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
 | ||||
| // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
 | ||||
| //
 | ||||
| //   {
 | ||||
| //       struct ggml_tensor * c = ggml_add(ctx, a, b);
 | ||||
| //
 | ||||
| //       assert(c->src[0] == a);
 | ||||
| //       assert(c->src[1] == b);
 | ||||
| //   }
 | ||||
| //
 | ||||
| // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
 | ||||
| // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
 | ||||
| // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
 | ||||
| // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
 | ||||
| // contiguous in memory.
 | ||||
| //
 | ||||
| // The data of the tensor is accessed via the "data" pointer. For example:
 | ||||
| //
 | ||||
| //   {
 | ||||
| //       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
 | ||||
| //
 | ||||
| //       // a[1, 2] = 1.0f;
 | ||||
| //       *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
 | ||||
| //
 | ||||
| //       // a[2, 0] = 2.0f;
 | ||||
| //       *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
 | ||||
| //
 | ||||
| //       ...
 | ||||
| //   }
 | ||||
| //
 | ||||
| // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
 | ||||
| //
 | ||||
| // ## The matrix multiplication operator (ggml_mul_mat)
 | ||||
| //
 | ||||
| // TODO
 | ||||
| //
 | ||||
| //
 | ||||
| // ## Multi-threading
 | ||||
| //
 | ||||
| // TODO
 | ||||
| //
 | ||||
| //
 | ||||
| // ## Overview of ggml.c
 | ||||
| //
 | ||||
| // TODO
 | ||||
| //
 | ||||
| //
 | ||||
| // ## SIMD optimizations
 | ||||
| //
 | ||||
| // TODO
 | ||||
| //
 | ||||
| //
 | ||||
| // ## Debugging ggml
 | ||||
| //
 | ||||
| // TODO
 | ||||
| //
 | ||||
| //
 | ||||
| 
 | ||||
| #ifdef GGML_SHARED | ||||
| #    if defined(_WIN32) && !defined(__MINGW32__) | ||||
| #        ifdef GGML_BUILD | ||||
| #            define GGML_API __declspec(dllexport) | ||||
| #        else | ||||
| #            define GGML_API __declspec(dllimport) | ||||
| #        endif | ||||
| #    else | ||||
| #        define GGML_API __attribute__ ((visibility ("default"))) | ||||
| #    endif | ||||
| #else | ||||
| #    define GGML_API | ||||
| #endif | ||||
| 
 | ||||
| #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 | ||||
| #define GGML_FILE_VERSION 1 | ||||
| 
 | ||||
| #define GGML_MAX_DIMS          4 | ||||
| #define GGML_MAX_NODES         4096 | ||||
| #define GGML_MAX_PARAMS        16 | ||||
| #define GGML_MAX_CONTEXTS      64 | ||||
| #define GGML_MAX_OPT           4 | ||||
| #define GGML_DEFAULT_N_THREADS 4 | ||||
| 
 | ||||
| #ifdef __ARM_NEON | ||||
|     // we use the built-in 16-bit float type
 | ||||
|     typedef __fp16 ggml_fp16_t; | ||||
| #else | ||||
|     typedef uint16_t ggml_fp16_t; | ||||
| #endif | ||||
| 
 | ||||
|     // convert FP16 <-> FP32
 | ||||
|     GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x); | ||||
|     GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); | ||||
| 
 | ||||
|     struct ggml_object; | ||||
|     struct ggml_context; | ||||
| 
 | ||||
|     enum ggml_type { | ||||
|         GGML_TYPE_F32  = 0, | ||||
|         GGML_TYPE_F16  = 1, | ||||
|         GGML_TYPE_Q4_0 = 2, | ||||
|         GGML_TYPE_Q4_1 = 3, | ||||
|         GGML_TYPE_Q4_2 = 4, | ||||
|         GGML_TYPE_Q4_3 = 5, | ||||
|         GGML_TYPE_Q5_0 = 6, | ||||
|         GGML_TYPE_Q5_1 = 7, | ||||
|         GGML_TYPE_Q8_0 = 8, | ||||
|         GGML_TYPE_Q8_1 = 9, | ||||
|         GGML_TYPE_I8, | ||||
|         GGML_TYPE_I16, | ||||
|         GGML_TYPE_I32, | ||||
|         GGML_TYPE_COUNT, | ||||
|     }; | ||||
| 
 | ||||
|     // available tensor operations:
 | ||||
|     enum ggml_op { | ||||
|         GGML_OP_NONE = 0, | ||||
| 
 | ||||
|         GGML_OP_DUP, | ||||
|         GGML_OP_ADD, | ||||
|         GGML_OP_SUB, | ||||
|         GGML_OP_MUL, | ||||
|         GGML_OP_DIV, | ||||
|         GGML_OP_SQR, | ||||
|         GGML_OP_SQRT, | ||||
|         GGML_OP_SUM, | ||||
|         GGML_OP_MEAN, | ||||
|         GGML_OP_REPEAT, | ||||
|         GGML_OP_ABS, | ||||
|         GGML_OP_SGN, | ||||
|         GGML_OP_NEG, | ||||
|         GGML_OP_STEP, | ||||
|         GGML_OP_RELU, | ||||
|         GGML_OP_GELU, | ||||
|         GGML_OP_SILU, | ||||
|         GGML_OP_NORM, // normalize
 | ||||
|         GGML_OP_RMS_NORM, | ||||
| 
 | ||||
|         GGML_OP_MUL_MAT, | ||||
| 
 | ||||
|         GGML_OP_SCALE, | ||||
|         GGML_OP_CPY, | ||||
|         GGML_OP_CONT, | ||||
|         GGML_OP_RESHAPE, | ||||
|         GGML_OP_VIEW, | ||||
|         GGML_OP_PERMUTE, | ||||
|         GGML_OP_TRANSPOSE, | ||||
|         GGML_OP_GET_ROWS, | ||||
|         GGML_OP_DIAG_MASK_INF, | ||||
|         GGML_OP_SOFT_MAX, | ||||
|         GGML_OP_ROPE, | ||||
|         GGML_OP_CONV_1D_1S, | ||||
|         GGML_OP_CONV_1D_2S, | ||||
| 
 | ||||
|         GGML_OP_FLASH_ATTN, | ||||
|         GGML_OP_FLASH_FF, | ||||
| 
 | ||||
|         GGML_OP_MAP_UNARY, | ||||
|         GGML_OP_MAP_BINARY, | ||||
| 
 | ||||
|         GGML_OP_COUNT, | ||||
|     }; | ||||
| 
 | ||||
| 
 | ||||
|     // ggml object
 | ||||
|     struct ggml_object { | ||||
|         size_t offs; | ||||
|         size_t size; | ||||
| 
 | ||||
|         struct ggml_object * next; | ||||
| 
 | ||||
|         char padding[8]; | ||||
|     }; | ||||
| 
 | ||||
|     static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object); | ||||
| 
 | ||||
|     // n-dimensional tensor
 | ||||
|     struct ggml_tensor { | ||||
|         enum ggml_type type; | ||||
| 
 | ||||
|         int     n_dims; | ||||
|         int64_t ne[GGML_MAX_DIMS]; // number of elements
 | ||||
|         size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
 | ||||
|                                    // nb[0] = sizeof(type)
 | ||||
|                                    // nb[1] = nb[0]   * ne[0] + padding
 | ||||
|                                    // nb[i] = nb[i-1] * ne[i-1]
 | ||||
| 
 | ||||
|         // compute data
 | ||||
|         enum ggml_op op; | ||||
| 
 | ||||
|         bool is_param; | ||||
| 
 | ||||
|         struct ggml_tensor * grad; | ||||
|         struct ggml_tensor * src0; | ||||
|         struct ggml_tensor * src1; | ||||
|         struct ggml_tensor * opt[GGML_MAX_OPT]; | ||||
| 
 | ||||
|         // thread scheduling
 | ||||
|         int n_tasks; | ||||
| 
 | ||||
|         // performance
 | ||||
|         int     perf_runs; | ||||
|         int64_t perf_cycles; | ||||
|         int64_t perf_time_us; | ||||
| 
 | ||||
|         void * data; | ||||
|         char padding[8]; | ||||
|     }; | ||||
| 
 | ||||
|     // computation graph
 | ||||
|     struct ggml_cgraph { | ||||
|         int n_nodes; | ||||
|         int n_leafs; | ||||
|         int n_threads; | ||||
| 
 | ||||
|         size_t work_size; | ||||
|         struct ggml_tensor * work; | ||||
| 
 | ||||
|         struct ggml_tensor * nodes[GGML_MAX_NODES]; | ||||
|         struct ggml_tensor * grads[GGML_MAX_NODES]; | ||||
|         struct ggml_tensor * leafs[GGML_MAX_NODES]; | ||||
| 
 | ||||
|         // performance
 | ||||
|         int     perf_runs; | ||||
|         int64_t perf_cycles; | ||||
|         int64_t perf_time_us; | ||||
|     }; | ||||
| 
 | ||||
|     // scratch buffer
 | ||||
|     struct ggml_scratch { | ||||
|         size_t offs; | ||||
|         size_t size; | ||||
|         void * data; | ||||
|     }; | ||||
| 
 | ||||
|     struct ggml_init_params { | ||||
|         // memory pool
 | ||||
|         size_t mem_size;   // bytes
 | ||||
|         void * mem_buffer; // if NULL, memory will be allocated internally
 | ||||
|         bool   no_alloc;   // don't allocate memory for the tensor data
 | ||||
|     }; | ||||
| 
 | ||||
|     // misc
 | ||||
| 
 | ||||
|     GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
 | ||||
|     GGML_API int64_t ggml_time_ms(void); | ||||
|     GGML_API int64_t ggml_time_us(void); | ||||
|     GGML_API int64_t ggml_cycles(void); | ||||
|     GGML_API int64_t ggml_cycles_per_ms(void); | ||||
| 
 | ||||
|     GGML_API void    ggml_print_object (const struct ggml_object * obj); | ||||
|     GGML_API void    ggml_print_objects(const struct ggml_context * ctx); | ||||
| 
 | ||||
|     GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor); | ||||
|     GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor); | ||||
| 
 | ||||
|     GGML_API int     ggml_blck_size (enum ggml_type type); | ||||
|     GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
 | ||||
|     GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 | ||||
| 
 | ||||
|     GGML_API const char * ggml_type_name(enum ggml_type type); | ||||
| 
 | ||||
|     GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor); | ||||
| 
 | ||||
|     GGML_API bool    ggml_is_quantized(enum ggml_type type); | ||||
| 
 | ||||
|     // main
 | ||||
| 
 | ||||
|     GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); | ||||
|     GGML_API void    ggml_free(struct ggml_context * ctx); | ||||
| 
 | ||||
|     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx); | ||||
| 
 | ||||
|     GGML_API size_t  ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_new_tensor( | ||||
|             struct ggml_context * ctx, | ||||
|             enum   ggml_type type, | ||||
|             int    n_dims, | ||||
|             const int64_t *ne); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_new_tensor_1d( | ||||
|             struct ggml_context * ctx, | ||||
|             enum   ggml_type type, | ||||
|             int64_t ne0); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_new_tensor_2d( | ||||
|             struct ggml_context * ctx, | ||||
|             enum   ggml_type type, | ||||
|             int64_t ne0, | ||||
|             int64_t ne1); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_new_tensor_3d( | ||||
|             struct ggml_context * ctx, | ||||
|             enum   ggml_type type, | ||||
|             int64_t ne0, | ||||
|             int64_t ne1, | ||||
|             int64_t ne2); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_new_tensor_4d( | ||||
|             struct ggml_context * ctx, | ||||
|             enum   ggml_type type, | ||||
|             int64_t ne0, | ||||
|             int64_t ne1, | ||||
|             int64_t ne2, | ||||
|             int64_t ne3); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); | ||||
|     GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); | ||||
|     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); | ||||
|     GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); | ||||
|     GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); | ||||
| 
 | ||||
|     GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); | ||||
|     GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); | ||||
| 
 | ||||
|     GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); | ||||
|     GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); | ||||
| 
 | ||||
|     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor); | ||||
|     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); | ||||
| 
 | ||||
|     //
 | ||||
|     // operations on tensors with backpropagation
 | ||||
|     //
 | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_dup( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_add( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_add_inplace( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_sub( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_mul( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_div( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_sqr( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_sqrt( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     // return scalar
 | ||||
|     // TODO: compute sum along rows
 | ||||
|     GGML_API struct ggml_tensor * ggml_sum( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     // mean along rows
 | ||||
|     GGML_API struct ggml_tensor * ggml_mean( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     // if a is the same shape as b, and a is not parameter, return a
 | ||||
|     // otherwise, return a new tensor: repeat(a) to fit in b
 | ||||
|     GGML_API struct ggml_tensor * ggml_repeat( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_abs( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_sgn( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_neg( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_step( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_relu( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     // TODO: double-check this computation is correct
 | ||||
|     GGML_API struct ggml_tensor * ggml_gelu( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_silu( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     // normalize along rows
 | ||||
|     // TODO: eps is hardcoded to 1e-5 for now
 | ||||
|     GGML_API struct ggml_tensor * ggml_norm( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_rms_norm( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     // A: m rows, n columns
 | ||||
|     // B: p rows, n columns (i.e. we transpose it internally)
 | ||||
|     // result is m columns, p rows
 | ||||
|     GGML_API struct ggml_tensor * ggml_mul_mat( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     //
 | ||||
|     // operations on tensors without backpropagation
 | ||||
|     //
 | ||||
| 
 | ||||
|     // in-place, returns view(a)
 | ||||
|     GGML_API struct ggml_tensor * ggml_scale( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     // a -> b, return view(b)
 | ||||
|     GGML_API struct ggml_tensor * ggml_cpy( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     // make contiguous
 | ||||
|     GGML_API struct ggml_tensor * ggml_cont( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     // return view(a), b specifies the new shape
 | ||||
|     // TODO: when we start computing gradient, make a copy instead of view
 | ||||
|     GGML_API struct ggml_tensor * ggml_reshape( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     // return view(a)
 | ||||
|     // TODO: when we start computing gradient, make a copy instead of view
 | ||||
|     GGML_API struct ggml_tensor * ggml_reshape_2d( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             int64_t               ne0, | ||||
|             int64_t               ne1); | ||||
| 
 | ||||
|     // return view(a)
 | ||||
|     // TODO: when we start computing gradient, make a copy instead of view
 | ||||
|     GGML_API struct ggml_tensor * ggml_reshape_3d( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             int64_t               ne0, | ||||
|             int64_t               ne1, | ||||
|             int64_t               ne2); | ||||
| 
 | ||||
|     // offset in bytes
 | ||||
|     GGML_API struct ggml_tensor * ggml_view_1d( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             int64_t               ne0, | ||||
|             size_t                offset); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_view_2d( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             int64_t               ne0, | ||||
|             int64_t               ne1, | ||||
|             size_t                nb1, // row stride in bytes
 | ||||
|             size_t                offset); | ||||
| 
 | ||||
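|     // e.g., a hedged sketch: a view of the first ne1 rows of a contiguous | ||||
|     // matrix reuses the source's own row stride and a zero byte offset: | ||||
|     //   struct ggml_tensor * v = ggml_view_2d(ctx, a, a->ne[0], ne1, a->nb[1], 0); | ||||
| 
 | ||||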
|     GGML_API struct ggml_tensor * ggml_view_3d( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             int64_t               ne0, | ||||
|             int64_t               ne1, | ||||
|             int64_t               ne2, | ||||
|             size_t                nb1, // row   stride in bytes
 | ||||
|             size_t                nb2, // slice stride in bytes
 | ||||
|             size_t                offset); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_permute( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             int                   axis0, | ||||
|             int                   axis1, | ||||
|             int                   axis2, | ||||
|             int                   axis3); | ||||
| 
 | ||||
|     // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
 | ||||
|     GGML_API struct ggml_tensor * ggml_transpose( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_get_rows( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     // set elements above the diagonal to -INF
 | ||||
|     // in-place, returns view(a)
 | ||||
|     GGML_API struct ggml_tensor * ggml_diag_mask_inf( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             int                   n_past); | ||||
| 
 | ||||
|     // in-place, returns view(a)
 | ||||
|     GGML_API struct ggml_tensor * ggml_soft_max( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a); | ||||
| 
 | ||||
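|     // Together, scale + diag_mask_inf + soft_max form the usual causal | ||||
|     // attention weighting. A hedged sketch (kq is an attention-score | ||||
|     // tensor and kq_scale a 1-element tensor, e.g. from ggml_new_f32): | ||||
|     //   kq = ggml_scale(ctx, kq, kq_scale);        // scores / sqrt(d_head) | ||||
|     //   kq = ggml_diag_mask_inf(ctx, kq, n_past);  // hide future positions | ||||
|     //   kq = ggml_soft_max(ctx, kq);               // rows now sum to 1 | ||||
| 
 | ||||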
|     // rotary position embedding
 | ||||
|     // in-place, returns view(a)
 | ||||
|     // if bit 0 of mode is set (mode & 1), the first n_past positions are skipped
 | ||||
|     // if bit 1 of mode is set (mode & 2), the GPT-NeoX rotation style is used
 | ||||
|     // TODO: avoid creating a new tensor every time
 | ||||
|     GGML_API struct ggml_tensor * ggml_rope( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             int                   n_past, | ||||
|             int                   n_dims, | ||||
|             int                   mode); | ||||
| 
 | ||||
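|     // e.g., a hedged sketch for a query tensor Qcur with n_rot rotary | ||||
|     // dimensions (mode 0 selects the original LLaMA rotation): | ||||
|     //   Qcur = ggml_rope(ctx, Qcur, n_past, n_rot, 0); | ||||
| 
 | ||||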
|     // padding = 1
 | ||||
|     // TODO: we don't support extra parameters for now
 | ||||
|     //       that's why we are hard-coding the stride, padding, and dilation
 | ||||
|     //       not great ..
 | ||||
|     GGML_API struct ggml_tensor * ggml_conv_1d_1s( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_conv_1d_2s( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_flash_attn( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * q, | ||||
|             struct ggml_tensor  * k, | ||||
|             struct ggml_tensor  * v, | ||||
|             bool                  masked); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_flash_ff( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor  * a, | ||||
|             struct ggml_tensor  * b0, | ||||
|             struct ggml_tensor  * b1, | ||||
|             struct ggml_tensor  * c0, | ||||
|             struct ggml_tensor  * c1); | ||||
| 
 | ||||
|     // Mapping operations
 | ||||
|     typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); | ||||
|     typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_map_unary_f32( | ||||
|             struct ggml_context        * ctx, | ||||
|             struct ggml_tensor         * a, | ||||
|             const  ggml_unary_op_f32_t fun); | ||||
| 
 | ||||
|     GGML_API struct ggml_tensor * ggml_map_binary_f32( | ||||
|             struct ggml_context         * ctx, | ||||
|             struct ggml_tensor          * a, | ||||
|             struct ggml_tensor          * b, | ||||
|             const  ggml_binary_op_f32_t fun); | ||||
| 
 | ||||
|     //
 | ||||
|     // automatic differentiation
 | ||||
|     //
 | ||||
| 
 | ||||
|     GGML_API void ggml_set_param( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_tensor * tensor); | ||||
| 
 | ||||
|     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); | ||||
| 
 | ||||
|     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); | ||||
|     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); | ||||
| 
 | ||||
|     GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); | ||||
|     GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph); | ||||
| 
 | ||||
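|     // A minimal hedged autodiff sketch (x and y assumed created in ctx, | ||||
|     // and x registered via ggml_set_param so it receives a gradient): | ||||
|     //   struct ggml_tensor * f = ggml_sum(ctx, ggml_mul(ctx, x, y)); | ||||
|     //   struct ggml_cgraph gf = ggml_build_forward(f); | ||||
|     //   struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false); | ||||
|     //   ggml_graph_compute(ctx, &gf);    // forward pass | ||||
|     //   ggml_graph_reset(&gf);           // zero all gradients | ||||
|     //   ggml_set_f32(f->grad, 1.0f);     // seed df/df = 1 | ||||
|     //   ggml_graph_compute(ctx, &gb);    // backward: x->grad now equals y | ||||
| 
 | ||||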
|     // print info and performance information for the graph
 | ||||
|     GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); | ||||
| 
 | ||||
|     // dump the graph into a file using the dot format
 | ||||
|     GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); | ||||
| 
 | ||||
|     //
 | ||||
|     // optimization
 | ||||
|     //
 | ||||
| 
 | ||||
|     // optimization methods
 | ||||
|     enum ggml_opt_type { | ||||
|         GGML_OPT_ADAM, | ||||
|         GGML_OPT_LBFGS, | ||||
|     }; | ||||
| 
 | ||||
|     // linesearch methods
 | ||||
|     enum ggml_linesearch { | ||||
|         GGML_LINESEARCH_DEFAULT = 1, | ||||
| 
 | ||||
|         GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0, | ||||
|         GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1, | ||||
|         GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, | ||||
|     }; | ||||
| 
 | ||||
|     // optimization return values
 | ||||
|     enum ggml_opt_result { | ||||
|         GGML_OPT_OK = 0, | ||||
|         GGML_OPT_DID_NOT_CONVERGE, | ||||
|         GGML_OPT_NO_CONTEXT, | ||||
|         GGML_OPT_INVALID_WOLFE, | ||||
|         GGML_OPT_FAIL, | ||||
| 
 | ||||
|         GGML_LINESEARCH_FAIL = -128, | ||||
|         GGML_LINESEARCH_MINIMUM_STEP, | ||||
|         GGML_LINESEARCH_MAXIMUM_STEP, | ||||
|         GGML_LINESEARCH_MAXIMUM_ITERATIONS, | ||||
|         GGML_LINESEARCH_INVALID_PARAMETERS, | ||||
|     }; | ||||
| 
 | ||||
|     // optimization parameters
 | ||||
|     //
 | ||||
|     //   see ggml.c (ggml_opt_default_params) for default values
 | ||||
|     //
 | ||||
|     struct ggml_opt_params { | ||||
|         enum ggml_opt_type type; | ||||
| 
 | ||||
|         int n_threads; | ||||
| 
 | ||||
|         // delta-based convergence test
 | ||||
|         //
 | ||||
|         //   if past == 0 - disabled
 | ||||
|         //   if past > 0:
 | ||||
|         //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
 | ||||
|         //
 | ||||
|         int past; | ||||
|         float delta; | ||||
| 
 | ||||
|         // maximum number of iterations without improvement
 | ||||
|         //
 | ||||
|         //   if 0 - disabled
 | ||||
|         //   if > 0:
 | ||||
|         //     assume convergence if no cost improvement in this number of iterations
 | ||||
|         //
 | ||||
|         int max_no_improvement; | ||||
| 
 | ||||
|         bool print_forward_graph; | ||||
|         bool print_backward_graph; | ||||
| 
 | ||||
|         // ADAM parameters
 | ||||
|         struct { | ||||
|             int n_iter; | ||||
| 
 | ||||
|             float alpha; // learning rate
 | ||||
|             float beta1; | ||||
|             float beta2; | ||||
|             float eps;   // epsilon for numerical stability
 | ||||
|             float eps_f; // epsilon for convergence test
 | ||||
|             float eps_g; // epsilon for convergence test
 | ||||
|         } adam; | ||||
| 
 | ||||
|         // LBFGS parameters
 | ||||
|         struct { | ||||
|             int m; // number of corrections to approximate the inv. Hessian
 | ||||
|             int n_iter; | ||||
|             int max_linesearch; | ||||
| 
 | ||||
|             float eps;      // convergence tolerance
 | ||||
|             float ftol;     // line search tolerance
 | ||||
|             float wolfe; | ||||
|             float min_step; | ||||
|             float max_step; | ||||
| 
 | ||||
|             enum ggml_linesearch linesearch; | ||||
|         } lbfgs; | ||||
|     }; | ||||
| 
 | ||||
|     GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); | ||||
| 
 | ||||
|     // optimize the function defined by the tensor f
 | ||||
|     GGML_API enum ggml_opt_result ggml_opt( | ||||
|             struct ggml_context * ctx, | ||||
|             struct ggml_opt_params params, | ||||
|             struct ggml_tensor * f); | ||||
| 
 | ||||
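|     // A hedged sketch (f is a scalar loss built in ctx, with its trainable | ||||
|     // tensors registered via ggml_set_param): | ||||
|     //   struct ggml_opt_params opt = ggml_opt_default_params(GGML_OPT_ADAM); | ||||
|     //   opt.adam.n_iter = 100;  // illustrative, not a tuned value | ||||
|     //   enum ggml_opt_result res = ggml_opt(ctx, opt, f); | ||||
|     //   if (res != GGML_OPT_OK) { /* did not converge; inspect res */ } | ||||
| 
 | ||||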
|     //
 | ||||
|     // quantization
 | ||||
|     //
 | ||||
| 
 | ||||
|     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); | ||||
|     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); | ||||
|     GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist); | ||||
|     GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist); | ||||
|     GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist); | ||||
|     GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist); | ||||
|     GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist); | ||||
| 
 | ||||
|     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); | ||||
| 
 | ||||
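|     // e.g., a hedged sketch, assuming src holds n floats with n a multiple | ||||
|     // of 32 (the q4_0 block size) and k set to the row length (here one row): | ||||
|     //   int64_t hist[16] = {0};  // one bucket per 4-bit quantized value | ||||
|     //   size_t bytes = ggml_quantize_q4_0(src, dst, n, n, hist); | ||||
| 
 | ||||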
|     //
 | ||||
|     // system info
 | ||||
|     //
 | ||||
| 
 | ||||
|     GGML_API int ggml_cpu_has_avx        (void); | ||||
|     GGML_API int ggml_cpu_has_avx2       (void); | ||||
|     GGML_API int ggml_cpu_has_avx512     (void); | ||||
|     GGML_API int ggml_cpu_has_avx512_vbmi(void); | ||||
|     GGML_API int ggml_cpu_has_avx512_vnni(void); | ||||
|     GGML_API int ggml_cpu_has_fma        (void); | ||||
|     GGML_API int ggml_cpu_has_neon       (void); | ||||
|     GGML_API int ggml_cpu_has_arm_fma    (void); | ||||
|     GGML_API int ggml_cpu_has_f16c       (void); | ||||
|     GGML_API int ggml_cpu_has_fp16_va    (void); | ||||
|     GGML_API int ggml_cpu_has_wasm_simd  (void); | ||||
|     GGML_API int ggml_cpu_has_blas       (void); | ||||
|     GGML_API int ggml_cpu_has_cublas     (void); | ||||
|     GGML_API int ggml_cpu_has_sse3       (void); | ||||
|     GGML_API int ggml_cpu_has_vsx        (void); | ||||
| 
 | ||||
| 
 | ||||
|     //
 | ||||
|     // Internal types and functions exposed for tests and benchmarks
 | ||||
|     //
 | ||||
| 
 | ||||
| #ifdef  __cplusplus | ||||
|     // restrict not standard in C++
 | ||||
| #define GGML_RESTRICT | ||||
| #else | ||||
| #define GGML_RESTRICT restrict | ||||
| #endif | ||||
|     typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); | ||||
|     typedef void (*quantize_row_q_t)  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); | ||||
|     typedef void (*vec_dot_q_t)       (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); | ||||
| 
 | ||||
|     typedef struct { | ||||
|         dequantize_row_q_t dequantize_row_q; | ||||
|         quantize_row_q_t   quantize_row_q; | ||||
|         quantize_row_q_t   quantize_row_q_reference; | ||||
|         quantize_row_q_t   quantize_row_q_dot; | ||||
|         vec_dot_q_t        vec_dot_q; | ||||
|         enum ggml_type     vec_dot_type; | ||||
|     } quantize_fns_t; | ||||
| 
 | ||||
|     quantize_fns_t ggml_internal_get_quantize_fn(size_t i); | ||||
| 
 | ||||
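|     // e.g. (tests only, hedged; src/dst/k are illustrative): | ||||
|     //   quantize_fns_t fns = ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0); | ||||
|     //   fns.quantize_row_q(src, dst, k); | ||||
| 
 | ||||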
| COSMOPOLITAN_C_END_ | ||||
| #endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */ | ||||
| #endif /* COSMOPOLITAN_THIRD_PARTY_LLAMA_CPP_GGML_H_ */ | ||||
							
								
								
									
114  third_party/ggml/ggml.mk  vendored  Normal file
							|  | @ -0,0 +1,114 @@ | |||
| #-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
 | ||||
| #───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
 | ||||
| 
 | ||||
| PKGS += THIRD_PARTY_GGML | ||||
| 
 | ||||
| ################################################################################
 | ||||
| # single-file machine learning framework written in C
 | ||||
| # make -j8 o//third_party/ggml/ggml.a
 | ||||
| 
 | ||||
| THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_A | ||||
| THIRD_PARTY_GGML = $(THIRD_PARTY_GGML_A_DEPS) $(THIRD_PARTY_GGML_A) | ||||
| THIRD_PARTY_GGML_A = o/$(MODE)/third_party/ggml/ggml.a | ||||
| THIRD_PARTY_GGML_A_HDRS = third_party/ggml/ggml.h | ||||
| THIRD_PARTY_GGML_A_SRCS = third_party/ggml/ggml.c | ||||
| THIRD_PARTY_GGML_A_OBJS = $(THIRD_PARTY_GGML_A_SRCS:%.c=o/$(MODE)/%.o) | ||||
| THIRD_PARTY_GGML_A_FILES = $(THIRD_PARTY_GGML_A_SRCS) $(THIRD_PARTY_GGML_A_HDRS) | ||||
| THIRD_PARTY_GGML_A_CHECKS = $(THIRD_PARTY_GGML_A).pkg $(THIRD_PARTY_GGML_A_HDRS:%=o/$(MODE)/%.ok) | ||||
| 
 | ||||
| THIRD_PARTY_GGML_A_DIRECTDEPS =					\
 | ||||
| 	LIBC_CALLS						\
 | ||||
| 	LIBC_INTRIN						\
 | ||||
| 	LIBC_MEM						\
 | ||||
| 	LIBC_NEXGEN32E						\
 | ||||
| 	LIBC_RUNTIME						\
 | ||||
| 	LIBC_STDIO						\
 | ||||
| 	LIBC_THREAD						\
 | ||||
| 	LIBC_STR						\
 | ||||
| 	LIBC_STUBS						\
 | ||||
| 	LIBC_SYSV						\
 | ||||
| 	LIBC_TINYMATH | ||||
| 
 | ||||
| THIRD_PARTY_GGML_A_DEPS :=					\
 | ||||
| 	$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x)))) | ||||
| 
 | ||||
| $(THIRD_PARTY_GGML_A):						\ | ||||
| 		third_party/ggml/				\
 | ||||
| 		$(THIRD_PARTY_GGML_A).pkg			\
 | ||||
| 		$(THIRD_PARTY_GGML_A_OBJS) | ||||
| 
 | ||||
| $(THIRD_PARTY_GGML_A).pkg:					\ | ||||
| 		$(THIRD_PARTY_GGML_A_OBJS)			\
 | ||||
| 		$(foreach x,$(THIRD_PARTY_GGML_A_DIRECTDEPS),$($(x)_A).pkg) | ||||
| 
 | ||||
| $(THIRD_PARTY_GGML_A_OBJS): private				\ | ||||
| 		OVERRIDE_CFLAGS +=				\
 | ||||
| 			-O3					\
 | ||||
| 			-ffunction-sections			\
 | ||||
| 			-fdata-sections				\
 | ||||
| 			-msse3					\
 | ||||
| 			-mavx					\
 | ||||
| 			-mavx2					\
 | ||||
| 			-mf16c					\
 | ||||
| 			-mfma | ||||
| 
 | ||||
| ################################################################################
 | ||||
| # command for running inference on large language models
 | ||||
| # make -j8 o//third_party/ggml/llama.com
 | ||||
| 
 | ||||
| THIRD_PARTY_GGML_ARTIFACTS += THIRD_PARTY_GGML_LLAMA | ||||
| THIRD_PARTY_GGML_LLAMA = o/$(MODE)/third_party/ggml/llama.com | ||||
| THIRD_PARTY_GGML_LLAMA_HDRS = third_party/ggml/llama.h third_party/ggml/llama_util.h third_party/ggml/common.h | ||||
| THIRD_PARTY_GGML_LLAMA_SRCS = third_party/ggml/main.cc third_party/ggml/llama.cc third_party/ggml/common.cc | ||||
| THIRD_PARTY_GGML_LLAMA_OBJS = $(THIRD_PARTY_GGML_LLAMA_SRCS:%.cc=o/$(MODE)/%.o) | ||||
| THIRD_PARTY_GGML_LLAMA_FILES := $(THIRD_PARTY_GGML_LLAMA_SRCS) $(THIRD_PARTY_GGML_LLAMA_HDRS) | ||||
| THIRD_PARTY_GGML_LLAMA_CHECKS = $(THIRD_PARTY_GGML_LLAMA).pkg $(THIRD_PARTY_GGML_LLAMA_HDRS:%=o/$(MODE)/%.okk) | ||||
| 
 | ||||
| THIRD_PARTY_GGML_LLAMA_DIRECTDEPS =				\
 | ||||
| 	LIBC_CALLS						\
 | ||||
| 	LIBC_FMT						\
 | ||||
| 	LIBC_INTRIN						\
 | ||||
| 	LIBC_MEM						\
 | ||||
| 	LIBC_NEXGEN32E						\
 | ||||
| 	LIBC_RUNTIME						\
 | ||||
| 	LIBC_STDIO						\
 | ||||
| 	LIBC_STR						\
 | ||||
| 	LIBC_STUBS						\
 | ||||
| 	LIBC_SYSV						\
 | ||||
| 	LIBC_THREAD						\
 | ||||
| 	LIBC_TINYMATH						\
 | ||||
| 	THIRD_PARTY_GGML					\
 | ||||
| 	THIRD_PARTY_LIBCXX | ||||
| 
 | ||||
| THIRD_PARTY_GGML_LLAMA_DEPS :=					\
 | ||||
| 	$(call uniq,$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x)))) | ||||
| 
 | ||||
| $(THIRD_PARTY_GGML_LLAMA).dbg:					\ | ||||
| 		$(THIRD_PARTY_GGML_LLAMA).pkg			\
 | ||||
| 		$(THIRD_PARTY_GGML_LLAMA_DEPS)			\
 | ||||
| 		o/$(MODE)/third_party/ggml/common.o		\
 | ||||
| 		o/$(MODE)/third_party/ggml/llama.o		\
 | ||||
| 		o/$(MODE)/third_party/ggml/main.o		\
 | ||||
| 		$(CRT)						\
 | ||||
| 		$(APE_NO_MODIFY_SELF) | ||||
| 	@$(APELINK) | ||||
| 
 | ||||
| $(THIRD_PARTY_GGML_LLAMA).pkg:					\ | ||||
| 		$(THIRD_PARTY_GGML_LLAMA_OBJS)			\
 | ||||
| 		$(foreach x,$(THIRD_PARTY_GGML_LLAMA_DIRECTDEPS),$($(x)_A).pkg) | ||||
| 
 | ||||
| ################################################################################
 | ||||
| 
 | ||||
| THIRD_PARTY_GGML_COMS = $(THIRD_PARTY_GGML_LLAMA) | ||||
| THIRD_PARTY_GGML_BINS = $(THIRD_PARTY_GGML_COMS) $(THIRD_PARTY_GGML_COMS:%=%.dbg) | ||||
| THIRD_PARTY_GGML_LIBS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x))) | ||||
| THIRD_PARTY_GGML_SRCS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_SRCS)) | ||||
| THIRD_PARTY_GGML_HDRS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_HDRS)) | ||||
| THIRD_PARTY_GGML_OBJS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_OBJS)) | ||||
| THIRD_PARTY_GGML_CHECKS = $(foreach x,$(THIRD_PARTY_GGML_ARTIFACTS),$($(x)_CHECKS)) | ||||
| $(THIRD_PARTY_GGML_OBJS): third_party/ggml/ggml.mk | ||||
| 
 | ||||
| .PHONY: o/$(MODE)/third_party/ggml | ||||
| o/$(MODE)/third_party/ggml:					\ | ||||
| 		$(THIRD_PARTY_GGML_BINS)			\
 | ||||
| 		$(THIRD_PARTY_GGML_CHECKS) | ||||
							
								
								
									
2472  third_party/ggml/llama.cc  vendored  Normal file
      (File diff suppressed because it is too large)

211  third_party/ggml/llama.h  vendored  Normal file
							|  | @ -0,0 +1,211 @@ | |||
| // -*- c++ -*-
 | ||||
| // clang-format off
 | ||||
| #ifndef LLAMA_H | ||||
| #define LLAMA_H | ||||
| 
 | ||||
| #ifdef LLAMA_SHARED | ||||
| #    if defined(_WIN32) && !defined(__MINGW32__) | ||||
| #        ifdef LLAMA_BUILD | ||||
| #            define LLAMA_API __declspec(dllexport) | ||||
| #        else | ||||
| #            define LLAMA_API __declspec(dllimport) | ||||
| #        endif | ||||
| #    else | ||||
| #        define LLAMA_API __attribute__ ((visibility ("default"))) | ||||
| #    endif | ||||
| #else | ||||
| #    define LLAMA_API | ||||
| #endif | ||||
| 
 | ||||
| #define LLAMA_FILE_VERSION 1 | ||||
| #define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
 | ||||
| #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
 | ||||
| 
 | ||||
| #ifdef __cplusplus | ||||
| extern "C" { | ||||
| #endif | ||||
| 
 | ||||
|     //
 | ||||
|     // C interface
 | ||||
|     //
 | ||||
|     // TODO: show sample usage
 | ||||
|     //
 | ||||
| 
 | ||||
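|     // A hedged loading sketch (error handling trimmed; "model.bin" is a | ||||
|     // placeholder path, not a real file in this tree): | ||||
|     //   struct llama_context_params lp = llama_context_default_params(); | ||||
|     //   struct llama_context * lctx = llama_init_from_file("model.bin", lp); | ||||
|     //   if (!lctx) { /* failed to load */ } | ||||
|     //   /* ... llama_tokenize / llama_eval / sampling, see below ... */ | ||||
|     //   llama_free(lctx); | ||||
| 
 | ||||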
|     struct llama_context; | ||||
| 
 | ||||
|     typedef int llama_token; | ||||
| 
 | ||||
|     typedef struct llama_token_data { | ||||
|         llama_token id;  // token id
 | ||||
| 
 | ||||
|         float p;     // probability of the token
 | ||||
|         float plog;  // log probability of the token
 | ||||
| 
 | ||||
|     } llama_token_data; | ||||
| 
 | ||||
|     typedef void (*llama_progress_callback)(float progress, void *ctx); | ||||
| 
 | ||||
|     struct llama_context_params { | ||||
|         int n_ctx;   // text context
 | ||||
|         int n_parts; // -1 for default
 | ||||
|         int seed;    // RNG seed, 0 for random
 | ||||
| 
 | ||||
|         bool f16_kv;     // use fp16 for KV cache
 | ||||
|         bool logits_all; // the llama_eval() call computes all logits, not just the last one
 | ||||
|         bool vocab_only; // only load the vocabulary, no weights
 | ||||
|         bool use_mmap;   // use mmap if possible
 | ||||
|         bool use_mlock;  // force system to keep model in RAM
 | ||||
|         bool embedding;  // embedding mode only
 | ||||
| 
 | ||||
|         // called with a progress value between 0 and 1, pass NULL to disable
 | ||||
|         llama_progress_callback progress_callback; | ||||
|         // context pointer passed to the progress callback
 | ||||
|         void * progress_callback_user_data; | ||||
|     }; | ||||
| 
 | ||||
|     // model file types
 | ||||
|     enum llama_ftype { | ||||
|         LLAMA_FTYPE_ALL_F32     = 0, | ||||
|         LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
 | ||||
|         LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
 | ||||
|         LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
 | ||||
|         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
 | ||||
|         LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
 | ||||
|         LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // except 1d tensors
 | ||||
|         LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
 | ||||
|         LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
 | ||||
|         LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
 | ||||
|     }; | ||||
| 
 | ||||
|     LLAMA_API struct llama_context_params llama_context_default_params(void); | ||||
| 
 | ||||
|     LLAMA_API bool llama_mmap_supported(void); | ||||
|     LLAMA_API bool llama_mlock_supported(void); | ||||
| 
 | ||||
|     // Various functions for loading a ggml llama model.
 | ||||
|     // Allocate (almost) all memory needed for the model.
 | ||||
|     // Return NULL on failure
 | ||||
|     LLAMA_API struct llama_context * llama_init_from_file( | ||||
|                              const char * path_model, | ||||
|             struct llama_context_params   params); | ||||
| 
 | ||||
|     // Frees all allocated memory
 | ||||
|     LLAMA_API void llama_free(struct llama_context * ctx); | ||||
| 
 | ||||
|     // TODO: not great API - very likely to change
 | ||||
|     // Returns 0 on success
 | ||||
|     // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
 | ||||
|     LLAMA_API int llama_model_quantize( | ||||
|             const char * fname_inp, | ||||
|             const char * fname_out, | ||||
|       enum llama_ftype   ftype, | ||||
|             int          nthread); | ||||
| 
 | ||||
|     // Apply a LoRA adapter to a loaded model
 | ||||
|     // path_base_model is the path to a higher quality model to use as a base for
 | ||||
|     // the layers modified by the adapter. Can be NULL to use the current loaded model.
 | ||||
|     // The model needs to be reloaded before applying a new adapter, otherwise the adapter
 | ||||
|     // will be applied on top of the previous one
 | ||||
|     // Returns 0 on success
 | ||||
|     LLAMA_API int llama_apply_lora_from_file( | ||||
|             struct llama_context * ctx, | ||||
|                       const char * path_lora, | ||||
|                       const char * path_base_model, | ||||
|                              int   n_threads); | ||||
| 
 | ||||
|     // Returns the number of tokens in the KV cache
 | ||||
|     LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx); | ||||
| 
 | ||||
|     // Sets the current rng seed.
 | ||||
|     LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); | ||||
| 
 | ||||
|     // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
 | ||||
|     LLAMA_API size_t llama_get_state_size(struct llama_context * ctx); | ||||
| 
 | ||||
|     // Copies the state to the specified destination address.
 | ||||
|     // Destination needs to have allocated enough memory.
 | ||||
|     // Returns the number of bytes copied
 | ||||
|     LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest); | ||||
| 
 | ||||
|     // Set the state reading from the specified address
 | ||||
|     // Returns the number of bytes read
 | ||||
|     LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src); | ||||
| 
 | ||||
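|     // A hedged snapshot/rollback sketch: | ||||
|     //   size_t n = llama_get_state_size(ctx); | ||||
|     //   uint8_t * buf = (uint8_t *) malloc(n); | ||||
|     //   llama_copy_state_data(ctx, buf);  // snapshot rng/logits/kv state | ||||
|     //   /* ... run more evals ... */ | ||||
|     //   llama_set_state_data(ctx, buf);   // restore the snapshot | ||||
|     //   free(buf); | ||||
| 
 | ||||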
|     // Run the llama inference to obtain the logits and probabilities for the next token.
 | ||||
|     // tokens + n_tokens is the provided batch of new tokens to process
 | ||||
|     // n_past is the number of tokens to use from previous eval calls
 | ||||
|     // Returns 0 on success
 | ||||
|     LLAMA_API int llama_eval( | ||||
|             struct llama_context * ctx, | ||||
|                const llama_token * tokens, | ||||
|                              int   n_tokens, | ||||
|                              int   n_past, | ||||
|                              int   n_threads); | ||||
| 
 | ||||
|     // Convert the provided text into tokens.
 | ||||
|     // The tokens pointer must be large enough to hold the resulting tokens.
 | ||||
|     // Returns the number of tokens on success, no more than n_max_tokens
 | ||||
|     // Returns a negative number on failure - the number of tokens that would have been returned
 | ||||
|     // TODO: not sure if correct
 | ||||
|     LLAMA_API int llama_tokenize( | ||||
|             struct llama_context * ctx, | ||||
|                       const char * text, | ||||
|                      llama_token * tokens, | ||||
|                              int   n_max_tokens, | ||||
|                             bool   add_bos); | ||||
| 
 | ||||
|     LLAMA_API int llama_n_vocab(struct llama_context * ctx); | ||||
|     LLAMA_API int llama_n_ctx  (struct llama_context * ctx); | ||||
|     LLAMA_API int llama_n_embd (struct llama_context * ctx); | ||||
| 
 | ||||
|     // Token logits obtained from the last call to llama_eval()
 | ||||
|     // The logits for the last token are stored in the last row
 | ||||
|     // Can be mutated in order to change the probabilities of the next token
 | ||||
|     // Rows: n_tokens
 | ||||
|     // Cols: n_vocab
 | ||||
|     LLAMA_API float * llama_get_logits(struct llama_context * ctx); | ||||
| 
 | ||||
|     // Get the embeddings for the input
 | ||||
|     // shape: [n_embd] (1-dimensional)
 | ||||
|     LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); | ||||
| 
 | ||||
|     // Token Id -> String. Uses the vocabulary in the provided context
 | ||||
|     LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token); | ||||
| 
 | ||||
|     // Special tokens
 | ||||
|     LLAMA_API llama_token llama_token_bos(void); | ||||
|     LLAMA_API llama_token llama_token_eos(void); | ||||
| 
 | ||||
|     // TODO: improve the last_n_tokens interface ?
 | ||||
|     LLAMA_API llama_token llama_sample_top_p_top_k( | ||||
|        struct llama_context * ctx, | ||||
|           const llama_token * last_n_tokens_data, | ||||
|                         int   last_n_tokens_size, | ||||
|                         int   top_k, | ||||
|                       float   top_p, | ||||
|                       float   temp, | ||||
|                       float   repeat_penalty); | ||||
| 
 | ||||
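|     // A hedged single-step generation sketch (tokens/n_tokens come from | ||||
|     // llama_tokenize; n_past bookkeeping is omitted; sampling constants | ||||
|     // are illustrative, not recommendations): | ||||
|     //   llama_eval(ctx, tokens, n_tokens, n_past, /*n_threads=*/4); | ||||
|     //   llama_token id = llama_sample_top_p_top_k(ctx, last_tokens, n_last, | ||||
|     //                                             40, 0.95f, 0.80f, 1.10f); | ||||
|     //   fputs(llama_token_to_str(ctx, id), stdout); | ||||
| 
 | ||||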
|     // Performance information
 | ||||
|     LLAMA_API void llama_print_timings(struct llama_context * ctx); | ||||
|     LLAMA_API void llama_reset_timings(struct llama_context * ctx); | ||||
| 
 | ||||
|     // Print system information
 | ||||
|     LLAMA_API const char * llama_print_system_info(void); | ||||
| 
 | ||||
| #ifdef __cplusplus | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
| // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 | ||||
| #ifdef LLAMA_API_INTERNAL | ||||
| 
 | ||||
| #include "third_party/libcxx/vector" | ||||
| #include "third_party/libcxx/string" | ||||
| struct ggml_tensor; | ||||
| 
 | ||||
| std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx); | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #endif // LLAMA_H
 | ||||
							
								
								
									
388  third_party/ggml/llama_util.h  vendored  Executable file
							|  | @ -0,0 +1,388 @@ | |||
| // Internal header to be included only by llama.cc.
 | ||||
| // Contains wrappers around OS interfaces.
 | ||||
| 
 | ||||
| #ifndef LLAMA_UTIL_H | ||||
| #define LLAMA_UTIL_H | ||||
| #include "libc/calls/struct/rlimit.h" | ||||
| #include "libc/fmt/fmt.h" | ||||
| #include "libc/runtime/sysconf.h" | ||||
| #include "libc/sysv/consts/madv.h" | ||||
| #include "libc/sysv/consts/map.h" | ||||
| #include "libc/sysv/consts/prot.h" | ||||
| #include "libc/sysv/consts/rlimit.h" | ||||
| #include "third_party/libcxx/cerrno" | ||||
| #include "third_party/libcxx/climits" | ||||
| #include "third_party/libcxx/cstdarg" | ||||
| #include "third_party/libcxx/cstdint" | ||||
| #include "third_party/libcxx/cstdio" | ||||
| #include "third_party/libcxx/cstdlib" | ||||
| #include "third_party/libcxx/cstring" | ||||
| #include "third_party/libcxx/string" | ||||
| #include "third_party/libcxx/vector" | ||||
| // clang-format off
 | ||||
| 
 | ||||
| #define LLAMA_ASSERT(x) \ | ||||
|     do { \ | ||||
|         if (!(x)) { \ | ||||
|             fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ | ||||
|             abort(); \ | ||||
|         } \ | ||||
|     } while (0) | ||||
| 
 | ||||
| #ifdef __GNUC__ | ||||
| #ifdef __MINGW32__ | ||||
| __attribute__((__format__(__gnu_printf__, 1, 2))) | ||||
| #else | ||||
| __attribute__((__format__(__printf__, 1, 2))) | ||||
| #endif | ||||
| __attribute__((__noreturn__)) | ||||
| #endif | ||||
| static void Die(const char *fmt, ...) { | ||||
|     va_list va; | ||||
|     va_start(va, fmt); | ||||
|     vfprintf(stderr, fmt, va); | ||||
|     va_end(va); | ||||
|     fputc('\n', stderr); | ||||
|     exit(1); | ||||
| } | ||||
| 
 | ||||
| struct llama_file { | ||||
|     // use FILE * so we don't have to re-open the file to mmap
 | ||||
|     FILE * fp; | ||||
|     size_t size; | ||||
| 
 | ||||
|     llama_file(const char * fname, const char * mode) { | ||||
|         fp = std::fopen(fname, mode); | ||||
|         if (fp == NULL) { | ||||
|             Die("failed to open %s: %s", fname, std::strerror(errno)); | ||||
|         } | ||||
|         seek(0, SEEK_END); | ||||
|         size = tell(); | ||||
|         seek(0, SEEK_SET); | ||||
|     } | ||||
| 
 | ||||
|     size_t tell() const { | ||||
| #ifdef _WIN32 | ||||
|         __int64 ret = _ftelli64(fp); | ||||
| #else | ||||
|         long ret = std::ftell(fp); | ||||
| #endif | ||||
|         LLAMA_ASSERT(ret != -1); // this really shouldn't fail
 | ||||
|         return (size_t) ret; | ||||
|     } | ||||
| 
 | ||||
|     void seek(size_t offset, int whence) { | ||||
| #ifdef _WIN32 | ||||
|         int ret = _fseeki64(fp, (__int64) offset, whence); | ||||
| #else | ||||
|         int ret = std::fseek(fp, (long) offset, whence); | ||||
| #endif | ||||
|         LLAMA_ASSERT(ret == 0); // like tell(), this really shouldn't fail
 | ||||
|     } | ||||
| 
 | ||||
|     void read_raw(void * ptr, size_t size) { | ||||
|         if (size == 0) { | ||||
|             return; | ||||
|         } | ||||
|         errno = 0; | ||||
|         std::size_t ret = std::fread(ptr, size, 1, fp); | ||||
|         if (ferror(fp)) { | ||||
|             Die("read error: %s", strerror(errno)); | ||||
|         } | ||||
|         if (ret != 1) { | ||||
|             Die("unexpectedly reached end of file"); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     std::uint32_t read_u32() { | ||||
|         std::uint32_t ret; | ||||
|         read_raw(&ret, sizeof(ret)); | ||||
|         return ret; | ||||
|     } | ||||
| 
 | ||||
|     std::string read_string(std::uint32_t len) { | ||||
|         std::vector<char> chars(len); | ||||
|         read_raw(chars.data(), len); | ||||
|         return std::string(chars.data(), len); | ||||
|     } | ||||
| 
 | ||||
|     void write_raw(const void * ptr, size_t size) { | ||||
|         if (size == 0) { | ||||
|             return; | ||||
|         } | ||||
|         errno = 0; | ||||
|         size_t ret = std::fwrite(ptr, size, 1, fp); | ||||
|         if (ret != 1) { | ||||
|             Die("write error: %s", strerror(errno)); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     void write_u32(std::uint32_t val) { | ||||
|         write_raw(&val, sizeof(val)); | ||||
|     } | ||||
| 
 | ||||
|     ~llama_file() { | ||||
|         if (fp) { | ||||
|             std::fclose(fp); | ||||
|         } | ||||
|     } | ||||
| }; | ||||
| 
 | ||||
| #if defined(_WIN32) | ||||
| static std::string llama_format_win_err(DWORD err) { | ||||
|     LPSTR buf; | ||||
|     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, | ||||
|                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); | ||||
|     if (!size) { | ||||
|         return "FormatMessageA failed"; | ||||
|     } | ||||
|     std::string ret(buf, size); | ||||
|     LocalFree(buf); | ||||
|     return ret; | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
| struct llama_mmap { | ||||
|     void * addr; | ||||
|     size_t size; | ||||
| 
 | ||||
|     llama_mmap(const llama_mmap &) = delete; | ||||
| 
 | ||||
| #ifdef _POSIX_MAPPED_FILES | ||||
|     static constexpr bool SUPPORTED = true; | ||||
| 
 | ||||
|     llama_mmap(struct llama_file * file, bool prefetch = true) { | ||||
|         size = file->size; | ||||
|         int fd = fileno(file->fp); | ||||
|         int flags = MAP_SHARED; | ||||
| #ifdef __linux__ | ||||
|         flags |= MAP_POPULATE; | ||||
| #endif | ||||
|         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); | ||||
|         if (addr == MAP_FAILED) { | ||||
|             Die("mmap failed: %s", strerror(errno)); | ||||
|         } | ||||
| 
 | ||||
|         if (prefetch) { | ||||
|             // Advise the kernel to preload the mapped memory
 | ||||
|             if (madvise(addr, file->size, MADV_WILLNEED)) { | ||||
|                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", | ||||
|                         strerror(errno)); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     ~llama_mmap() { | ||||
|         munmap(addr, size); | ||||
|     } | ||||
| #elif defined(_WIN32) | ||||
|     static constexpr bool SUPPORTED = true; | ||||
| 
 | ||||
|     llama_mmap(struct llama_file * file, bool prefetch = true) { | ||||
|         size = file->size; | ||||
| 
 | ||||
|         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); | ||||
| 
 | ||||
|         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); | ||||
|         DWORD error = GetLastError(); | ||||
| 
 | ||||
|         if (hMapping == NULL) { | ||||
|             Die("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()); | ||||
|         } | ||||
| 
 | ||||
|         addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); | ||||
|         error = GetLastError(); | ||||
|         CloseHandle(hMapping); | ||||
| 
 | ||||
|         if (addr == NULL) { | ||||
|             Die("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()); | ||||
|         } | ||||
| 
 | ||||
|         #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 | ||||
|         if (prefetch) { | ||||
|             // Advise the kernel to preload the mapped memory
 | ||||
|             WIN32_MEMORY_RANGE_ENTRY range; | ||||
|             range.VirtualAddress = addr; | ||||
|             range.NumberOfBytes = (SIZE_T)size; | ||||
|             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { | ||||
|                 fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", | ||||
|                         llama_format_win_err(GetLastError()).c_str()); | ||||
|             } | ||||
|         } | ||||
|         #else | ||||
|         #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") | ||||
|         #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
 | ||||
|     } | ||||
| 
 | ||||
|     ~llama_mmap() { | ||||
|         if (!UnmapViewOfFile(addr)) { | ||||
|             fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", | ||||
|                     llama_format_win_err(GetLastError()).c_str()); | ||||
|         } | ||||
|     } | ||||
| #else | ||||
|     static constexpr bool SUPPORTED = false; | ||||
| 
 | ||||
|     llama_mmap(struct llama_file *) { | ||||
|         Die("mmap not supported"); | ||||
|     } | ||||
| #endif | ||||
| }; | ||||
| 
 | ||||
| // Represents some region of memory being locked using mlock or VirtualLock;
 | ||||
| // will automatically unlock on destruction.
 | ||||
| struct llama_mlock { | ||||
|     void * addr = NULL; | ||||
|     size_t size = 0; | ||||
|     bool failed_already = false; | ||||
| 
 | ||||
|     llama_mlock() {} | ||||
|     llama_mlock(const llama_mlock &) = delete; | ||||
| 
 | ||||
|     ~llama_mlock() { | ||||
|         if (size) { | ||||
|             raw_unlock(addr, size); | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     void init(void * addr) { | ||||
|         LLAMA_ASSERT(this->addr == NULL && this->size == 0); | ||||
|         this->addr = addr; | ||||
|     } | ||||
| 
 | ||||
|     void grow_to(size_t target_size) { | ||||
|         LLAMA_ASSERT(addr); | ||||
|         if (failed_already) { | ||||
|             return; | ||||
|         } | ||||
|         size_t granularity = lock_granularity(); | ||||
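|         // round target_size up to a multiple of the lock granularity; | ||||
|         // e.g. with a 4096-byte page, 5000 rounds up to 8192 | ||||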
|         target_size = (target_size + granularity - 1) & ~(granularity - 1); | ||||
|         if (target_size > size) { | ||||
|             if (raw_lock((uint8_t *) addr + size, target_size - size)) { | ||||
|                 size = target_size; | ||||
|             } else { | ||||
|                 failed_already = true; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
| #ifdef _POSIX_MEMLOCK_RANGE | ||||
|     static constexpr bool SUPPORTED = true; | ||||
| 
 | ||||
|     size_t lock_granularity() { | ||||
|         return (size_t) sysconf(_SC_PAGESIZE); | ||||
|     } | ||||
| 
 | ||||
|     #ifdef __APPLE__ | ||||
|         #define MLOCK_SUGGESTION \ | ||||
|             "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ | ||||
|             "decreasing 'vm.global_no_user_wire_amount'.  Also try increasing RLIMIT_MLOCK (ulimit -l).\n" | ||||
|     #else | ||||
|         #define MLOCK_SUGGESTION \ | ||||
|             "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" | ||||
|     #endif | ||||
| 
 | ||||
|     bool raw_lock(const void * addr, size_t size) { | ||||
|         if (!mlock(addr, size)) { | ||||
|             return true; | ||||
|         } else { | ||||
|             char* errmsg = std::strerror(errno); | ||||
|             bool suggest = (errno == ENOMEM); | ||||
| 
 | ||||
|             // Check if the resource limit is fine after all
 | ||||
|             struct rlimit lock_limit; | ||||
|             if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) | ||||
|                 suggest = false; | ||||
|             if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) | ||||
|                 suggest = false; | ||||
| 
 | ||||
|             fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", | ||||
|                     size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); | ||||
|             return false; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     #undef MLOCK_SUGGESTION | ||||
| 
 | ||||
|     void raw_unlock(void * addr, size_t size) { | ||||
|         if (munlock(addr, size)) { | ||||
|             fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); | ||||
|         } | ||||
|     } | ||||
| #elif defined(_WIN32) | ||||
|     static constexpr bool SUPPORTED = true; | ||||
| 
 | ||||
|     size_t lock_granularity() { | ||||
|         SYSTEM_INFO si; | ||||
|         GetSystemInfo(&si); | ||||
|         return (size_t) si.dwPageSize; | ||||
|     } | ||||
| 
 | ||||
|     bool raw_lock(void * addr, size_t size) { | ||||
|         for (int tries = 1; ; tries++) { | ||||
|             if (VirtualLock(addr, size)) { | ||||
|                 return true; | ||||
|             } | ||||
|             if (tries == 2) { | ||||
|                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", | ||||
|                         size, this->size, llama_format_win_err(GetLastError()).c_str()); | ||||
|                 return false; | ||||
|             } | ||||
| 
 | ||||
|             // It failed but this was only the first try; increase the working
 | ||||
|             // set size and try again.
 | ||||
|             SIZE_T min_ws_size, max_ws_size; | ||||
|             if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { | ||||
|                 fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", | ||||
|                         llama_format_win_err(GetLastError()).c_str()); | ||||
|                 return false; | ||||
|             } | ||||
|             // Per MSDN: "The maximum number of pages that a process can lock
 | ||||
|             // is equal to the number of pages in its minimum working set minus
 | ||||
|             // a small overhead."
 | ||||
|             // Hopefully a megabyte is enough overhead:
 | ||||
|             size_t increment = size + 1048576; | ||||
|             // The minimum must be <= the maximum, so we need to increase both:
 | ||||
|             min_ws_size += increment; | ||||
|             max_ws_size += increment; | ||||
|             if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { | ||||
|                 fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", | ||||
|                         llama_format_win_err(GetLastError()).c_str()); | ||||
|                 return false; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     void raw_unlock(void * addr, size_t size) { | ||||
|         if (!VirtualUnlock(addr, size)) { | ||||
|             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", | ||||
|                     llama_format_win_err(GetLastError()).c_str()); | ||||
|         } | ||||
|     } | ||||
| #else | ||||
|     static constexpr bool SUPPORTED = false; | ||||
| 
 | ||||
|     void raw_lock(const void * addr, size_t size) { | ||||
|         fprintf(stderr, "warning: mlock not supported on this system\n"); | ||||
|     } | ||||
| 
 | ||||
|     void raw_unlock(const void * addr, size_t size) {} | ||||
| #endif | ||||
| }; | ||||
| 
 | ||||
| // Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
 | ||||
| struct llama_buffer { | ||||
|     uint8_t * addr = NULL; | ||||
|     size_t size = 0; | ||||
| 
 | ||||
|     void resize(size_t size) { | ||||
|         delete[] addr; | ||||
|         addr = new uint8_t[size]; | ||||
|         this->size = size; | ||||
|     } | ||||
| 
 | ||||
|     ~llama_buffer() { | ||||
|         delete[] addr; | ||||
|     } | ||||
| }; | ||||
| #endif | ||||
							
								
								
									
568  third_party/ggml/main.cc  vendored  Normal file
							|  | @ -0,0 +1,568 @@ | |||
| /*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;tab-width:8;coding:utf-8 -*-│
 | ||||
| │vi: set net ft=c ts=4 sts=4 sw=4 fenc=utf-8                                :vi│ | ||||
| ╚──────────────────────────────────────────────────────────────────────────────╝ | ||||
| │                                                                              │ | ||||
| │  llama.cpp                                                                   │ | ||||
| │  Copyright (c) 2023 Georgi Gerganov                                          │ | ||||
| │                                                                              │ | ||||
| │  Permission is hereby granted, free of charge, to any person obtaining       │ | ||||
| │  a copy of this software and associated documentation files (the             │ | ||||
| │  "Software"), to deal in the Software without restriction, including         │ | ||||
| │  without limitation the rights to use, copy, modify, merge, publish,         │ | ||||
| │  distribute, sublicense, and/or sell copies of the Software, and to          │ | ||||
| │  permit persons to whom the Software is furnished to do so, subject to       │ | ||||
| │  the following conditions:                                                   │ | ||||
| │                                                                              │ | ||||
| │  The above copyright notice and this permission notice shall be              │ | ||||
| │  included in all copies or substantial portions of the Software.             │ | ||||
| │                                                                              │ | ||||
| │  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,             │ | ||||
| │  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF          │ | ||||
| │  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.      │ | ||||
| │  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY        │ | ||||
| │  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,        │ | ||||
| │  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE           │ | ||||
| │  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                      │ | ||||
| │                                                                              │ | ||||
| ╚─────────────────────────────────────────────────────────────────────────────*/ | ||||
| 
 | ||||
| asm(".ident\t\"\\n\\n\
 | ||||
| llama.cpp (MIT License)\\n\ | ||||
| Copyright (c) 2023 Georgi Gerganov\""); | ||||
| asm(".include \"libc/disclaimer.inc\""); | ||||
| // clang-format off
 | ||||
| 
 | ||||
| // Defines sigaction on msys:
 | ||||
| #ifndef _GNU_SOURCE | ||||
| #define _GNU_SOURCE | ||||
| #endif | ||||
| 
 | ||||
| #include "third_party/ggml/common.h" | ||||
| #include "libc/nexgen32e/x86feature.h" | ||||
| #include "third_party/ggml/llama.h" | ||||
| 
 | ||||
| #include "third_party/libcxx/cassert" | ||||
| #include "third_party/libcxx/cinttypes" | ||||
| #include "third_party/libcxx/cmath" | ||||
| #include "third_party/libcxx/cstdio" | ||||
| #include "third_party/libcxx/cstring" | ||||
| #include "third_party/libcxx/ctime" | ||||
| #include "third_party/libcxx/fstream" | ||||
| #include "third_party/libcxx/iostream" | ||||
| #include "third_party/libcxx/string" | ||||
| #include "third_party/libcxx/vector" | ||||
| 
 | ||||
| #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) | ||||
| #include "libc/calls/calls.h" | ||||
| #include "libc/calls/sigtimedwait.h" | ||||
| #include "libc/calls/struct/sigaction.h" | ||||
| #include "libc/calls/struct/siginfo.h" | ||||
| #include "libc/sysv/consts/sa.h" | ||||
| #include "libc/sysv/consts/sicode.h" | ||||
| #include "libc/sysv/consts/ss.h" | ||||
| #include "libc/calls/calls.h" | ||||
| #include "libc/calls/weirdtypes.h" | ||||
| #include "libc/runtime/pathconf.h" | ||||
| #include "libc/runtime/runtime.h" | ||||
| #include "libc/runtime/sysconf.h" | ||||
| #include "libc/sysv/consts/f.h" | ||||
| #include "libc/sysv/consts/fileno.h" | ||||
| #include "libc/sysv/consts/o.h" | ||||
| #include "libc/sysv/consts/ok.h" | ||||
| #include "libc/time/time.h" | ||||
| #include "third_party/getopt/getopt.h" | ||||
| #include "third_party/musl/crypt.h" | ||||
| #include "third_party/musl/lockf.h" | ||||
| #elif defined (_WIN32) | ||||
| #include "libc/calls/calls.h" | ||||
| #include "libc/calls/sigtimedwait.h" | ||||
| #include "libc/calls/struct/sigaction.h" | ||||
| #include "libc/calls/struct/siginfo.h" | ||||
| #include "libc/sysv/consts/sa.h" | ||||
| #include "libc/sysv/consts/sicode.h" | ||||
| #include "libc/sysv/consts/ss.h" | ||||
| #endif | ||||
| 
 | ||||
| static console_state con_st; | ||||
| static llama_context ** g_ctx; | ||||
| 
 | ||||
| static bool is_interacting = false; | ||||
| 
 | ||||
| #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) | ||||
| void sigint_handler(int signo) { | ||||
|     set_console_color(con_st, CONSOLE_COLOR_DEFAULT); | ||||
|     printf("\n"); // this also force flush stdout.
 | ||||
|     if (signo == SIGINT) { | ||||
|         if (!is_interacting) { | ||||
|             is_interacting=true; | ||||
|         } else { | ||||
|             llama_print_timings(*g_ctx); | ||||
|             _exit(130); | ||||
|         } | ||||
|     } | ||||
| } | ||||
| #endif | ||||
| 
 | ||||
| static int on_missing_feature(const char *name) { | ||||
|     fprintf(stderr, "error: we require %s support in your microprocessor.\n", name); | ||||
|     return 1; | ||||
| } | ||||
| 
 | ||||
| int main(int argc, char ** argv) { | ||||
|     gpt_params params; | ||||
|     params.model = "models/llama-7B/ggml-model.bin"; | ||||
| 
 | ||||
|     if (!X86_HAVE(AVX2)) return on_missing_feature("avx2"); | ||||
|     if (!X86_HAVE(AVX)) return on_missing_feature("avx"); | ||||
|     if (!X86_HAVE(FMA)) return on_missing_feature("fma"); | ||||
|     if (!X86_HAVE(F16C)) return on_missing_feature("f16c"); | ||||
|     if (!X86_HAVE(SSE3)) return on_missing_feature("sse3"); | ||||
| 
 | ||||
|     if (gpt_params_parse(argc, argv, params) == false) { | ||||
|         return 1; | ||||
|     } | ||||
| 
 | ||||
|     // save choice to use color for later
 | ||||
|     // (note for later: this is a slightly awkward choice)
 | ||||
|     con_st.use_color = params.use_color; | ||||
| 
 | ||||
| #if defined (_WIN32) | ||||
|     win32_console_init(params.use_color); | ||||
| #endif | ||||
| 
 | ||||
|     if (params.perplexity) { | ||||
|         printf("\n************\n"); | ||||
|         printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); | ||||
|         printf("************\n\n"); | ||||
| 
 | ||||
|         return 0; | ||||
|     } | ||||
| 
 | ||||
|     if (params.embedding) { | ||||
|         printf("\n************\n"); | ||||
|         printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); | ||||
|         printf("************\n\n"); | ||||
| 
 | ||||
|         return 0; | ||||
|     } | ||||
| 
 | ||||
|     if (params.n_ctx > 2048) { | ||||
|         fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);" | ||||
|                 "expect poor results\n", __func__, params.n_ctx); | ||||
|     } | ||||
| 
 | ||||
|     if (params.seed <= 0) { | ||||
|         params.seed = time(NULL); | ||||
|     } | ||||
| 
 | ||||
|     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); | ||||
| 
 | ||||
|     std::mt19937 rng(params.seed); | ||||
|     if (params.random_prompt) { | ||||
|         params.prompt = gpt_random_prompt(rng); | ||||
|     } | ||||
| 
 | ||||
| //    params.prompt = R"(// this function checks if the number n is prime
 | ||||
| //bool is_prime(int n) {)";
 | ||||
| 
 | ||||
|     llama_context * ctx; | ||||
|     g_ctx = &ctx; | ||||
| 
 | ||||
|     // load the model
 | ||||
|     { | ||||
|         auto lparams = llama_context_default_params(); | ||||
| 
 | ||||
|         lparams.n_ctx      = params.n_ctx; | ||||
|         lparams.n_parts    = params.n_parts; | ||||
|         lparams.seed       = params.seed; | ||||
|         lparams.f16_kv     = params.memory_f16; | ||||
|         lparams.use_mmap   = params.use_mmap; | ||||
|         lparams.use_mlock  = params.use_mlock; | ||||
| 
 | ||||
|         ctx = llama_init_from_file(params.model.c_str(), lparams); | ||||
| 
 | ||||
|         if (ctx == NULL) { | ||||
|             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); | ||||
|             return 1; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if (!params.lora_adapter.empty()) { | ||||
|         int err = llama_apply_lora_from_file(ctx, | ||||
|                                              params.lora_adapter.c_str(), | ||||
|                                              params.lora_base.empty() ? NULL : params.lora_base.c_str(), | ||||
|                                              params.n_threads); | ||||
|         if (err != 0) { | ||||
|             fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); | ||||
|             return 1; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     // print system information
 | ||||
|     { | ||||
|         fprintf(stderr, "\n"); | ||||
|         fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", | ||||
|                 params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); | ||||
|     } | ||||
| 
 | ||||
|     // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
 | ||||
|     // uncomment the "used_mem" line in llama.cpp to see the results
 | ||||
|     if (params.mem_test) { | ||||
|         { | ||||
|             const std::vector<llama_token> tmp(params.n_batch, 0); | ||||
|             llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); | ||||
|         } | ||||
| 
 | ||||
|         { | ||||
|             const std::vector<llama_token> tmp = { 0, }; | ||||
|             llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); | ||||
|         } | ||||
| 
 | ||||
|         llama_print_timings(ctx); | ||||
|         llama_free(ctx); | ||||
| 
 | ||||
|         return 0; | ||||
|     } | ||||
| 
 | ||||
|     // Add a space in front of the first character to match OG llama tokenizer behavior
 | ||||
|     params.prompt.insert(0, 1, ' '); | ||||
| 
 | ||||
|     // tokenize the prompt
 | ||||
|     auto embd_inp = ::llama_tokenize(ctx, params.prompt, true); | ||||
| 
 | ||||
|     const int n_ctx = llama_n_ctx(ctx); | ||||
| 
 | ||||
|     if ((int) embd_inp.size() > n_ctx - 4) { | ||||
|         fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); | ||||
|         return 1; | ||||
|     } | ||||
| 
 | ||||
|     // number of tokens to keep when resetting context
 | ||||
|     if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) { | ||||
|         params.n_keep = (int)embd_inp.size(); | ||||
|     } | ||||
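|     // these n_keep tokens are the part of the prompt that survives every
|     // context reset below; instruct mode always keeps the full initial prompt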
| 
 | ||||
|     // prefix & suffix for instruct mode
 | ||||
|     const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); | ||||
|     const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); | ||||
| 
 | ||||
|     // in instruct mode, we inject a prefix and a suffix to each input by the user
 | ||||
|     if (params.instruct) { | ||||
|         params.interactive_first = true; | ||||
|         params.antiprompt.push_back("### Instruction:\n\n"); | ||||
|     } | ||||
| 
 | ||||
|     // enable interactive mode if reverse prompt or interactive start is specified
 | ||||
|     if (params.antiprompt.size() != 0 || params.interactive_first) { | ||||
|         params.interactive = true; | ||||
|     } | ||||
| 
 | ||||
|     // determine newline token
 | ||||
|     auto llama_token_newline = ::llama_tokenize(ctx, "\n", false); | ||||
| 
 | ||||
|     if (params.verbose_prompt) { | ||||
|         fprintf(stderr, "\n"); | ||||
|         fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str()); | ||||
|         fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); | ||||
|         for (int i = 0; i < (int) embd_inp.size(); i++) { | ||||
|             fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i])); | ||||
|         } | ||||
|         if (params.n_keep > 0) { | ||||
|         fprintf(stderr, "%s: static prompt based on n_keep: '", __func__); | ||||
|             for (int i = 0; i < params.n_keep; i++) { | ||||
|                 fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i])); | ||||
|             } | ||||
|             fprintf(stderr, "'\n"); | ||||
|         } | ||||
|         fprintf(stderr, "\n"); | ||||
|     } | ||||
| 
 | ||||
|     if (params.interactive) { | ||||
| #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) | ||||
|         struct sigaction sigint_action; | ||||
|         sigint_action.sa_handler = sigint_handler; | ||||
|         sigemptyset (&sigint_action.sa_mask); | ||||
|         sigint_action.sa_flags = 0; | ||||
|         sigaction(SIGINT, &sigint_action, NULL); | ||||
| #elif defined (_WIN32) | ||||
|         signal(SIGINT, sigint_handler); | ||||
| #endif | ||||
| 
 | ||||
|         fprintf(stderr, "%s: interactive mode on.\n", __func__); | ||||
| 
 | ||||
|         if (params.antiprompt.size()) { | ||||
|             for (auto antiprompt : params.antiprompt) { | ||||
|                 fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str()); | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         if (!params.input_prefix.empty()) { | ||||
|             fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); | ||||
|         } | ||||
|     } | ||||
|     fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", | ||||
|         params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); | ||||
|     fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); | ||||
|     fprintf(stderr, "\n\n"); | ||||
| 
 | ||||
|     // TODO: replace with ring-buffer
 | ||||
|     std::vector<llama_token> last_n_tokens(n_ctx); | ||||
|     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); | ||||
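|     // last_n_tokens holds the most recent n_ctx tokens, zero-padded at first;
|     // it feeds both the repeat penalty and the reverse-prompt detection below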
| 
 | ||||
|     if (params.interactive) { | ||||
|         fprintf(stderr, "== Running in interactive mode. ==\n" | ||||
| #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) | ||||
|                " - Press Ctrl+C to interject at any time.\n" | ||||
| #endif | ||||
|                " - Press Return to return control to LLaMa.\n" | ||||
|                " - If you want to submit another line, end your input in '\\'.\n\n"); | ||||
|         is_interacting = params.interactive_first; | ||||
|     } | ||||
| 
 | ||||
|     bool is_antiprompt = false; | ||||
|     bool input_noecho  = false; | ||||
| 
 | ||||
|     int n_past     = 0; | ||||
|     int n_remain   = params.n_predict; | ||||
|     int n_consumed = 0; | ||||
| 
 | ||||
|     // the first thing we do is output the prompt, so set the color accordingly
 | ||||
|     set_console_color(con_st, CONSOLE_COLOR_PROMPT); | ||||
| 
 | ||||
|     std::vector<llama_token> embd; | ||||
| 
 | ||||
|     while (n_remain != 0 || params.interactive) { | ||||
|         // predict
 | ||||
|         if (embd.size() > 0) { | ||||
|             // infinite text generation via context swapping
 | ||||
|             // if we run out of context:
 | ||||
|             // - take the n_keep first tokens from the original prompt (via n_past)
 | ||||
|             // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
 | ||||
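|             // worked example: with n_ctx = 2048, n_keep = 16, n_past = 2041
|             // and 8 pending tokens, n_left = 2025; n_past resets to 16 and
|             // the last n_left/2 = 1012 tokens are re-evaluated ahead of embd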
|             if (n_past + (int) embd.size() > n_ctx) { | ||||
|                 const int n_left = n_past - params.n_keep; | ||||
| 
 | ||||
|                 n_past = params.n_keep; | ||||
| 
 | ||||
|                 // insert n_left/2 tokens at the start of embd from last_n_tokens
 | ||||
|                 embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); | ||||
| 
 | ||||
|                 //printf("\n---\n");
 | ||||
|                 //printf("resetting: '");
 | ||||
|                 //for (int i = 0; i < (int) embd.size(); i++) {
 | ||||
|                 //    printf("%s", llama_token_to_str(ctx, embd[i]));
 | ||||
|                 //}
 | ||||
|                 //printf("'\n");
 | ||||
|                 //printf("\n---\n");
 | ||||
|             } | ||||
| 
 | ||||
|             // evaluate tokens in batches
 | ||||
|             // embd is typically prepared beforehand to fit within a batch, but not always
 | ||||
|             for (int i = 0; i < (int) embd.size(); i += params.n_batch) { | ||||
|                 int n_eval = (int) embd.size() - i; | ||||
|                 if (n_eval > params.n_batch) { | ||||
|                     n_eval = params.n_batch; | ||||
|                 } | ||||
|                 if (llama_eval(ctx, &embd[i], n_eval, n_past, params.n_threads)) { | ||||
|                     fprintf(stderr, "%s : failed to eval\n", __func__); | ||||
|                     return 1; | ||||
|                 } | ||||
|                 n_past += n_eval; | ||||
|             } | ||||
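|             // n_past counts the tokens already resident in the model's KV
|             // cache; each llama_eval call appends n_eval more entries there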
|         } | ||||
| 
 | ||||
|         embd.clear(); | ||||
| 
 | ||||
|         if ((int) embd_inp.size() <= n_consumed && !is_interacting) { | ||||
|             // out of user input, sample next token
 | ||||
|             const int32_t top_k          = params.top_k; | ||||
|             const float   top_p          = params.top_p; | ||||
|             const float   temp           = params.temp; | ||||
|             const float   repeat_penalty = params.repeat_penalty; | ||||
| 
 | ||||
|             llama_token id = 0; | ||||
| 
 | ||||
|             { | ||||
|                 auto logits = llama_get_logits(ctx); | ||||
| 
 | ||||
|                 if (params.ignore_eos) { | ||||
|                     logits[llama_token_eos()] = 0; | ||||
|                 } | ||||
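|                 // (note: zeroing the logit only makes EOS unlikely, not
|                 // impossible; -INFINITY would rule it out entirely)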
| 
 | ||||
|                 id = llama_sample_top_p_top_k(ctx, | ||||
|                         last_n_tokens.data() + n_ctx - params.repeat_last_n, | ||||
|                         params.repeat_last_n, top_k, top_p, temp, repeat_penalty); | ||||
| 
 | ||||
|                 last_n_tokens.erase(last_n_tokens.begin()); | ||||
|                 last_n_tokens.push_back(id); | ||||
|             } | ||||
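|             // (llama_sample_top_p_top_k penalizes tokens seen in the
|             // repeat_last_n window, keeps the top_k candidates, then samples
|             // from the temperature-scaled nucleus of cumulative mass top_p)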
| 
 | ||||
|             // replace end of text token with newline token when in interactive mode
 | ||||
|             if (id == llama_token_eos() && params.interactive && !params.instruct) { | ||||
|                 id = llama_token_newline.front(); | ||||
|                 if (params.antiprompt.size() != 0) { | ||||
|                     // tokenize and inject first reverse prompt
 | ||||
|                     const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false); | ||||
|                     embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             // add it to the context
 | ||||
|             embd.push_back(id); | ||||
| 
 | ||||
|             // echo this to console
 | ||||
|             input_noecho = false; | ||||
| 
 | ||||
|             // decrement remaining sampling budget
 | ||||
|             --n_remain; | ||||
|         } else { | ||||
|             // some user input remains from prompt or interaction, forward it to processing
 | ||||
|             while ((int) embd_inp.size() > n_consumed) { | ||||
|                 embd.push_back(embd_inp[n_consumed]); | ||||
|                 last_n_tokens.erase(last_n_tokens.begin()); | ||||
|                 last_n_tokens.push_back(embd_inp[n_consumed]); | ||||
|                 ++n_consumed; | ||||
|                 if ((int) embd.size() >= params.n_batch) { | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         // display text
 | ||||
|         if (!input_noecho) { | ||||
|             for (auto id : embd) { | ||||
|                 printf("%s", llama_token_to_str(ctx, id)); | ||||
|             } | ||||
|             fflush(stdout); | ||||
|         } | ||||
|         // reset color to default if there is no pending user input
 | ||||
|         if (!input_noecho && (int)embd_inp.size() == n_consumed) { | ||||
|             set_console_color(con_st, CONSOLE_COLOR_DEFAULT); | ||||
|         } | ||||
| 
 | ||||
|         // in interactive mode, and not currently processing queued inputs;
 | ||||
|         // check if we should prompt the user for more
 | ||||
|         if (params.interactive && (int) embd_inp.size() <= n_consumed) { | ||||
| 
 | ||||
|             // check for reverse prompt
 | ||||
|             if (params.antiprompt.size()) { | ||||
|                 std::string last_output; | ||||
|                 for (auto id : last_n_tokens) { | ||||
|                     last_output += llama_token_to_str(ctx, id); | ||||
|                 } | ||||
| 
 | ||||
|                 is_antiprompt = false; | ||||
|                 // Check if each of the reverse prompts appears at the end of the output.
 | ||||
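|                 // (the three-argument find starts searching at offset
|                 // last_output.length() - antiprompt.length(), so only a
|                 // suffix match can succeed; if last_output is shorter, the
|                 // offset wraps past the end and find safely returns npos)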
|                 for (std::string & antiprompt : params.antiprompt) { | ||||
|                     if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { | ||||
|                         is_interacting = true; | ||||
|                         is_antiprompt = true; | ||||
|                         set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); | ||||
|                         fflush(stdout); | ||||
|                         break; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
| 
 | ||||
|             if (n_past > 0 && is_interacting) { | ||||
|                 // potentially set color to indicate we are taking user input
 | ||||
|                 set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); | ||||
| 
 | ||||
| #if defined (_WIN32) | ||||
|                 // Windows: must reactivate sigint handler after each signal
 | ||||
|                 signal(SIGINT, sigint_handler); | ||||
| #endif | ||||
| 
 | ||||
|                 if (params.instruct) { | ||||
|                     printf("\n> "); | ||||
|                 } | ||||
| 
 | ||||
|                 std::string buffer; | ||||
|                 if (!params.input_prefix.empty()) { | ||||
|                     buffer += params.input_prefix; | ||||
|                     printf("%s", buffer.c_str()); | ||||
|                 } | ||||
| 
 | ||||
|                 std::string line; | ||||
|                 bool another_line = true; | ||||
|                 do { | ||||
| #if defined(_WIN32) | ||||
|                     std::wstring wline; | ||||
|                     if (!std::getline(std::wcin, wline)) { | ||||
|                         // input stream is bad or EOF received
 | ||||
|                         return 0; | ||||
|                     } | ||||
|                     win32_utf8_encode(wline, line); | ||||
| #else | ||||
|                     if (!std::getline(std::cin, line)) { | ||||
|                         // input stream is bad or EOF received
 | ||||
|                         return 0; | ||||
|                     } | ||||
| #endif | ||||
|                     if (line.empty() || line.back() != '\\') { | ||||
|                         another_line = false; | ||||
|                     } else { | ||||
|                         line.pop_back(); // Remove the continue character
 | ||||
|                     } | ||||
|                     buffer += line + '\n'; // Append the line to the result
 | ||||
|                 } while (another_line); | ||||
| 
 | ||||
|                 // done taking input, reset color
 | ||||
|                 set_console_color(con_st, CONSOLE_COLOR_DEFAULT); | ||||
| 
 | ||||
|                 // Add tokens to embd only if the input buffer is non-empty
 | ||||
|                 // Entering an empty line lets the user pass control back
|                 // (buffer always ends in '\n', hence the length > 1 check)
 | ||||
|                 if (buffer.length() > 1) { | ||||
| 
 | ||||
|                     // instruct mode: insert instruction prefix
 | ||||
|                     if (params.instruct && !is_antiprompt) { | ||||
|                         n_consumed = embd_inp.size(); | ||||
|                         embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); | ||||
|                     } | ||||
| 
 | ||||
|                     auto line_inp = ::llama_tokenize(ctx, buffer, false); | ||||
|                     embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); | ||||
| 
 | ||||
|                     // instruct mode: insert response suffix
 | ||||
|                     if (params.instruct) { | ||||
|                         embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); | ||||
|                     } | ||||
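|                     // the full turn now reads "\n\n### Instruction:\n\n"
|                     // + <input> + "\n\n### Response:\n\n", the Alpaca-style
|                     // template this mode appears to target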
| 
 | ||||
|                     n_remain -= line_inp.size(); | ||||
|                 } | ||||
| 
 | ||||
|                 input_noecho = true; // do not echo this again
 | ||||
|             } | ||||
| 
 | ||||
|             if (n_past > 0) { | ||||
|                 is_interacting = false; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         // end of text token
 | ||||
|         if (!embd.empty() && embd.back() == llama_token_eos()) { | ||||
|             if (params.instruct) { | ||||
|                 is_interacting = true; | ||||
|             } else { | ||||
|                 fprintf(stderr, " [end of text]\n"); | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
| 
 | ||||
|         // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
 | ||||
|         if (params.interactive && n_remain <= 0 && params.n_predict != -1) { | ||||
|             n_remain = params.n_predict; | ||||
|             is_interacting = true; | ||||
|         } | ||||
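|         // (so in interactive mode n_predict caps the tokens generated per
|         // turn rather than ending the session)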
|     } | ||||
| 
 | ||||
| #if defined (_WIN32) | ||||
|     signal(SIGINT, SIG_DFL); | ||||
| #endif | ||||
| 
 | ||||
|     llama_print_timings(ctx); | ||||
|     llama_free(ctx); | ||||
| 
 | ||||
|     set_console_color(con_st, CONSOLE_COLOR_DEFAULT); | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
4  third_party/libcxx/__hash_table  vendored
							|  | @ -1066,7 +1066,7 @@ public: | |||
| 
 | ||||
| #ifndef _LIBCPP_CXX03_LANG | ||||
|     template <class _Key, class ..._Args> | ||||
|     _LIBCPP_INLINE_VISIBILITY | ||||
|     inline _LIBCPP_INLINE_VISIBILITY | ||||
|     pair<iterator, bool> __emplace_unique_key_args(_Key const& __k, _Args&&... __args); | ||||
| 
 | ||||
|     template <class... _Args> | ||||
|  | @ -2104,7 +2104,7 @@ __hash_table<_Tp, _Hash, _Equal, _Alloc>::__emplace_unique_key_args(_Key const& | |||
|     size_type __bc = bucket_count(); | ||||
|     bool __inserted = false; | ||||
|     __next_pointer __nd; | ||||
|     size_t __chash; | ||||
|     size_t __chash = 0; | ||||
|     if (__bc != 0) | ||||
|     { | ||||
|         __chash = __constrain_hash(__hash, __bc); | ||||
|  |  | |||
1  third_party/third_party.mk  vendored
							|  | @ -13,6 +13,7 @@ o/$(MODE)/third_party:				\ | |||
| 	o/$(MODE)/third_party/finger		\
 | ||||
| 	o/$(MODE)/third_party/gdtoa		\
 | ||||
| 	o/$(MODE)/third_party/getopt		\
 | ||||
| 	o/$(MODE)/third_party/ggml		\
 | ||||
| 	o/$(MODE)/third_party/hiredis		\
 | ||||
| 	o/$(MODE)/third_party/libcxx		\
 | ||||
| 	o/$(MODE)/third_party/linenoise		\
 | ||||
|  |  | |||