diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
index 446b8e8fb..9f9ed9db0 100644
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,6 +1,7 @@
-#include
 #include "ggml.h"
 #include "build-info.h"
+
+#include
 #include
 #include
 #include
diff --git a/examples/common.cpp b/examples/common.cpp
index e89df537e..1308f8410 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -578,6 +578,37 @@ void console_set_color(console_state & con_st, console_color_t color) {
 }
 
 char32_t getchar32() {
+#if defined(_WIN32)
+    HANDLE hConsole = GetStdHandle(STD_INPUT_HANDLE);
+    wchar_t high_surrogate = 0;
+
+    while (true) {
+        INPUT_RECORD record;
+        DWORD count;
+        if (!ReadConsoleInputW(hConsole, &record, 1, &count) || count == 0) {
+            return WEOF;
+        }
+
+        if (record.EventType == KEY_EVENT && record.Event.KeyEvent.bKeyDown) {
+            wchar_t wc = record.Event.KeyEvent.uChar.UnicodeChar;
+            if (wc == 0) {
+                continue;
+            }
+
+            if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+                high_surrogate = wc;
+                continue;
+            } else if ((wc >= 0xDC00) && (wc <= 0xDFFF)) { // Check if wc is a low surrogate
+                if (high_surrogate != 0) { // Check if we have a high surrogate
+                    return ((high_surrogate - 0xD800) << 10) + (wc - 0xDC00) + 0x10000;
+                }
+            }
+
+            high_surrogate = 0; // Reset the high surrogate
+            return static_cast<char32_t>(wc);
+        }
+    }
+#else
     wchar_t wc = getwchar();
     if (static_cast<wint_t>(wc) == WEOF) {
         return WEOF;
@@ -596,6 +627,7 @@ char32_t getchar32() {
 #endif
 
     return static_cast<char32_t>(wc);
+#endif
 }
 
 void pop_cursor(console_state & con_st) {
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index c24f7f820..03603b10f 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -31,6 +31,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
+    llama_init_backend();
+
     llama_context * ctx;
 
     // load the model
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 4d886f8de..47b418d97 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -96,8 +96,7 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
-//    params.prompt = R"(// this function checks if the number n is prime
-//bool is_prime(int n) {)";
+    llama_init_backend();
 
     llama_context * ctx;
     g_ctx = &ctx;
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9d38626cb..e19c6825f 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -143,6 +143,8 @@ int main(int argc, char ** argv) {
         params.prompt = gpt_random_prompt(rng);
     }
 
+    llama_init_backend();
+
     llama_context * ctx;
 
     // load the model and apply lora adapter, if any
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 115d8fb1b..769dd36a4 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -1,7 +1,7 @@
-#include "ggml.h"
-#include "llama.h"
 #include "build-info.h"
 
+#include "llama.h"
+
 #include
 #include
 #include
@@ -42,8 +42,6 @@ bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::st
 // ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
-    ggml_time_init();
-
     if (argc < 3) {
         fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
         for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
@@ -52,12 +50,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    // needed to initialize f16 tables
-    {
-        struct ggml_init_params params = { 0, NULL, false };
-        struct ggml_context * ctx = ggml_init(params);
-        ggml_free(ctx);
-    }
+    llama_init_backend();
 
     // parse command line arguments
     const std::string fname_inp = argv[1];
@@ -116,25 +109,25 @@ int main(int argc, char ** argv) {
     }
     fprintf(stderr, "\n");
 
-    const int64_t t_main_start_us = ggml_time_us();
+    const int64_t t_main_start_us = llama_time_us();
 
     int64_t t_quantize_us = 0;
 
     // load the model
     {
-        const int64_t t_start_us = ggml_time_us();
+        const int64_t t_start_us = llama_time_us();
 
         if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }
 
-        t_quantize_us = ggml_time_us() - t_start_us;
+        t_quantize_us = llama_time_us() - t_start_us;
     }
 
     // report timing
     {
-        const int64_t t_main_end_us = ggml_time_us();
+        const int64_t t_main_end_us = llama_time_us();
 
         printf("\n");
         printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
diff --git a/ggml.c b/ggml.c
index 1cb89636a..939ab4d62 100644
--- a/ggml.c
+++ b/ggml.c
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }
 
-#if __AVX2__ || __AVX512F__
+#if defined(__AVX2__) || defined(__AVX512F__)
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
@@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
-#if __ARM_NEON
+#if defined(__ARM_NEON)
 
 #if !defined(__aarch64__)
 
@@ -2481,7 +2481,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
             sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
         }
 
-        sumf += (GGML_FP16_TO_FP32(x[i]).d*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
     }
 
     *s = sumf;
diff --git a/llama-util.h b/llama-util.h
index 88ec28dca..a79c5dadd 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -101,12 +101,12 @@ struct llama_file {
         LLAMA_ASSERT(ret == 0); // same
     }
 
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
         if (ferror(fp)) {
             throw std::runtime_error(format("read error: %s", strerror(errno)));
         }
@@ -127,12 +127,12 @@ struct llama_file {
         return std::string(chars.data(), len);
     }
 
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
             return;
         }
         errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
         if (ret != 1) {
             throw std::runtime_error(format("write error: %s", strerror(errno)));
         }
@@ -267,9 +267,9 @@ struct llama_mlock {
         }
     }
 
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
     }
 
     void grow_to(size_t target_size) {
@@ -340,14 +340,14 @@ struct llama_mlock {
         return (size_t) si.dwPageSize;
     }
 
-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
         for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                 return true;
             }
             if (tries == 2) {
                 fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                    size, this->size, llama_format_win_err(GetLastError()).c_str());
+                    len, size, llama_format_win_err(GetLastError()).c_str());
                 return false;
             }
 
@@ -363,7 +363,7 @@ struct llama_mlock {
             // is equal to the number of pages in its minimum working set minus
             // a small overhead."
             // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
             // The minimum must be <= the maximum, so we need to increase both:
             min_ws_size += increment;
             max_ws_size += increment;
@@ -375,8 +375,8 @@ struct llama_mlock {
         }
     }
 
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
             fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                 llama_format_win_err(GetLastError()).c_str());
         }
@@ -388,12 +388,12 @@ struct llama_mlock {
         return (size_t) 65536;
     }
 
-    bool raw_lock(const void * addr, size_t size) {
+    bool raw_lock(const void * addr, size_t len) {
         fprintf(stderr, "warning: mlock not supported on this system\n");
         return false;
     }
 
-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
 #endif
 };
 
@@ -404,10 +404,10 @@ struct llama_buffer {
 
     llama_buffer() = default;
 
-    void resize(size_t size) {
+    void resize(size_t len) {
         delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
     }
 
     ~llama_buffer() {
diff --git a/llama.cpp b/llama.cpp
index 6ebe85d0f..5e6980b48 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -45,6 +45,7 @@ enum e_model {
     MODEL_65B,
 };
 
+
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -110,7 +111,7 @@ struct llama_hparams {
     enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 
     bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
     }
 };
 
@@ -502,7 +503,7 @@ struct llama_file_loader {
 
             if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
                 // skip to the next multiple of 32 bytes
-                file.seek(-file.tell() & 31, SEEK_CUR);
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
             }
             shard.file_idx = file_idx;
             shard.file_off = file.tell();
@@ -577,7 +578,7 @@ struct llama_file_saver {
         file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
         LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
         file.write_raw(new_data, new_size);
     }
@@ -838,6 +839,21 @@ bool llama_mlock_supported() {
     return llama_mlock::SUPPORTED;
 }
 
+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
 //
 // model loading
 //
@@ -2618,8 +2634,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }
 
 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;
 
     // set rng
     {
diff --git a/llama.h b/llama.h
index fd3f21e5f..0a63d034b 100644
--- a/llama.h
+++ b/llama.h
@@ -40,9 +40,9 @@ extern "C" {
     typedef int llama_token;
 
     typedef struct llama_token_data {
-        llama_token id;  // token id
-        float logit; // log-odds of the token
-        float p;     // probability of the token
+        llama_token id; // token id
+        float logit;    // log-odds of the token
+        float p;        // probability of the token
     } llama_token_data;
 
     typedef struct llama_token_data_array {
@@ -73,16 +73,16 @@ extern "C" {
 
     // model file types
     enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32     = 0,
-        LLAMA_FTYPE_MOSTLY_F16  = 1, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
     };
 
     LLAMA_API struct llama_context_params llama_context_default_params();
@@ -90,6 +90,13 @@ extern "C" {
     LLAMA_API bool llama_mmap_supported();
     LLAMA_API bool llama_mlock_supported();
 
+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
@@ -138,7 +145,7 @@ extern "C" {
 
     // Set the state reading from the specified address
     // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
 
     // Save/load session file
     LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);