From 90b19bd6eee943832584f9cac0b6f9ea29cc42a4 Mon Sep 17 00:00:00 2001 From: Alex Klinkhamer Date: Mon, 1 May 2023 00:24:20 -0700 Subject: [PATCH 1/7] llama : let context be const when accessing const data (#1261) --- llama.cpp | 12 ++++++------ llama.h | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llama.cpp b/llama.cpp index f8b4c8e46..3d82113a0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2373,7 +2373,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor } } -int llama_get_kv_cache_token_count(struct llama_context * ctx) { +int llama_get_kv_cache_token_count(const struct llama_context * ctx) { return ctx->model.kv_self.n; } @@ -2387,7 +2387,7 @@ void llama_set_rng_seed(struct llama_context * ctx, int seed) { } // Returns the size of the state -size_t llama_get_state_size(struct llama_context * ctx) { +size_t llama_get_state_size(const struct llama_context * ctx) { // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. // for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); @@ -2605,15 +2605,15 @@ int llama_tokenize( return res.size(); } -int llama_n_vocab(struct llama_context * ctx) { +int llama_n_vocab(const struct llama_context * ctx) { return ctx->vocab.id_to_token.size(); } -int llama_n_ctx(struct llama_context * ctx) { +int llama_n_ctx(const struct llama_context * ctx) { return ctx->model.hparams.n_ctx; } -int llama_n_embd(struct llama_context * ctx) { +int llama_n_embd(const struct llama_context * ctx) { return ctx->model.hparams.n_embd; } @@ -2625,7 +2625,7 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_to_str(struct llama_context * ctx, llama_token token) { +const char * llama_token_to_str(const struct llama_context * ctx, llama_token token) { if (token >= llama_n_vocab(ctx)) { return nullptr; } diff --git a/llama.h b/llama.h index 34a8f5b3c..9fbba7643 100644 --- a/llama.h +++ b/llama.h @@ -120,13 +120,13 @@ extern "C" { int n_threads); // Returns the number of tokens in the KV cache - LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx); + LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); // Sets the current rng seed. LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); // Returns the size in bytes of the state (rng, logits, embedding and kv_cache) - LLAMA_API size_t llama_get_state_size(struct llama_context * ctx); + LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); // Copies the state to the specified destination address. // Destination needs to have allocated enough memory. @@ -164,9 +164,9 @@ extern "C" { int n_max_tokens, bool add_bos); - LLAMA_API int llama_n_vocab(struct llama_context * ctx); - LLAMA_API int llama_n_ctx (struct llama_context * ctx); - LLAMA_API int llama_n_embd (struct llama_context * ctx); + LLAMA_API int llama_n_vocab(const struct llama_context * ctx); + LLAMA_API int llama_n_ctx (const struct llama_context * ctx); + LLAMA_API int llama_n_embd (const struct llama_context * ctx); // Token logits obtained from the last call to llama_eval() // The logits for the last token are stored in the last row @@ -180,7 +180,7 @@ extern "C" { LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); // Token Id -> String. Uses the vocabulary in the provided context - LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); // Special tokens LLAMA_API llama_token llama_token_bos(); From b925f1f1b082319ee69943f8d1a83ac9b6ff09ca Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Mon, 1 May 2023 13:32:22 +0200 Subject: [PATCH 2/7] cuBLAS: fall back to pageable memory if pinned alloc fails (#1233) * cuBLAS: fall back to pageable memory if pinned alloc fails * cuBLAS: do not use pinned memory if env variable GGML_CUDA_NO_PINNED is set --- ggml-cuda.cu | 14 ++++++++++++-- llama-util.h | 44 +++++++++++++++++++++++++++++++++++++++----- llama.cpp | 3 +-- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5a2701cfe..c1ec306f0 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -355,8 +355,18 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, } void * ggml_cuda_host_malloc(size_t size) { - void * ptr; - CUDA_CHECK(cudaMallocHost((void **) &ptr, size)); + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return nullptr; + } + + void * ptr = nullptr; + cudaError_t err = cudaMallocHost((void **) &ptr, size); + if (err != cudaSuccess) { + fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + size/1024.0/1024.0, cudaGetErrorString(err)); + return nullptr; + } + return ptr; } diff --git a/llama-util.h b/llama-util.h index ca4dd162f..5f9f70ecc 100644 --- a/llama-util.h +++ b/llama-util.h @@ -395,6 +395,8 @@ struct llama_buffer { uint8_t * addr = NULL; size_t size = 0; + llama_buffer() = default; + void resize(size_t size) { delete[] addr; addr = new uint8_t[size]; @@ -404,27 +406,59 @@ struct llama_buffer { ~llama_buffer() { delete[] addr; } + + // disable copy and move + llama_buffer(const llama_buffer&) = delete; + llama_buffer(llama_buffer&&) = delete; + llama_buffer& operator=(const llama_buffer&) = delete; + llama_buffer& operator=(llama_buffer&&) = delete; }; #ifdef GGML_USE_CUBLAS #include "ggml-cuda.h" struct llama_ctx_buffer { uint8_t * addr = NULL; + bool is_cuda; size_t size = 0; + llama_ctx_buffer() = default; + void resize(size_t size) { - if (addr) { - ggml_cuda_host_free(addr); - } + free(); + addr = (uint8_t *) ggml_cuda_host_malloc(size); + if (addr) { + is_cuda = true; + } + else { + // fall back to pageable memory + addr = new uint8_t[size]; + is_cuda = false; + } this->size = size; } - ~llama_ctx_buffer() { + void free() { if (addr) { - ggml_cuda_host_free(addr); + if (is_cuda) { + ggml_cuda_host_free(addr); + } + else { + delete[] addr; + } } + addr = NULL; } + + ~llama_ctx_buffer() { + free(); + } + + // disable copy and move + llama_ctx_buffer(const llama_ctx_buffer&) = delete; + llama_ctx_buffer(llama_ctx_buffer&&) = delete; + llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete; + llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete; }; #else typedef llama_buffer llama_ctx_buffer; diff --git a/llama.cpp b/llama.cpp index 3d82113a0..0d094a52f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -727,8 +727,7 @@ struct llama_model_loader { LLAMA_ASSERT(offset == lt.size); } else if (lt.split_type == SPLIT_BY_COLUMNS) { // Let's load the data into temporary buffers to ensure the OS performs large loads. - std::vector tmp_bufs; - tmp_bufs.resize(lt.shards.size()); + std::vector tmp_bufs(lt.shards.size()); for (size_t i = 0; i < lt.shards.size(); i++) { llama_load_tensor_shard & shard = lt.shards.at(i); llama_file & file = file_loaders.at(shard.file_idx)->file; From 70269cae37538461ff816e714afbb3ebcdcdc26b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 1 May 2023 14:54:59 +0300 Subject: [PATCH 3/7] llama : fix session load / save (#1263) --- examples/main/main.cpp | 20 +++---- llama.cpp | 133 ++++++++++++++++++++++++----------------- llama.h | 12 ++-- 3 files changed, 96 insertions(+), 69 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 990d0fa02..78fc9a197 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -161,23 +161,22 @@ int main(int argc, char ** argv) { std::vector session_tokens; if (!path_session.empty()) { - fprintf(stderr, "%s: attempting to load saved session from %s..\n", __func__, path_session.c_str()); + fprintf(stderr, "%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); - // REVIEW - fopen to check for existing session + // fopen to check for existing session FILE * fp = std::fopen(path_session.c_str(), "rb"); if (fp != NULL) { std::fclose(fp); session_tokens.resize(params.n_ctx); size_t n_token_count_out = 0; - const size_t n_session_bytes = llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out); + if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } session_tokens.resize(n_token_count_out); - if (n_session_bytes > 0) { - fprintf(stderr, "%s: loaded %zu bytes of session data!\n", __func__, n_session_bytes); - } else { - fprintf(stderr, "%s: could not load session file, will recreate\n", __func__); - } + fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); } else { fprintf(stderr, "%s: session file does not exist, will create\n", __func__); } @@ -214,7 +213,7 @@ int main(int argc, char ** argv) { } // number of tokens to keep when resetting context - if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) { + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) { params.n_keep = (int)embd_inp.size(); } @@ -329,7 +328,7 @@ int main(int argc, char ** argv) { // insert n_left/2 tokens at the start of embd from last_n_tokens embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size()); - // REVIEW - stop saving session if we run out of context + // stop saving session if we run out of context path_session = ""; //printf("\n---\n"); @@ -355,6 +354,7 @@ int main(int argc, char ** argv) { n_session_consumed++; if (n_session_consumed >= (int) session_tokens.size()) { + ++i; break; } } diff --git a/llama.cpp b/llama.cpp index 0d094a52f..868a58a8b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2566,6 +2566,85 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { return nread; } +bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(path_session, "rb"); + + // sanity checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (!(magic == LLAMA_SESSION_MAGIC && version == LLAMA_SESSION_VERSION)) { + fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); + return false; + } + + llama_hparams session_hparams; + file.read_raw(&session_hparams, sizeof(llama_hparams)); + + if (session_hparams != ctx->model.hparams) { + fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); + return false; + } + } + + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + fprintf(stderr, "%s : token count in session file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return false; + } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } + + // restore the context state + { + const size_t n_state_size_cur = file.size - file.tell(); + const size_t n_state_size_exp = llama_get_state_size(ctx); + + if (n_state_size_cur != n_state_size_exp) { + fprintf(stderr, "%s : the state size in session file didn't match! expected %zu, got %zu\n", __func__, n_state_size_exp, n_state_size_cur); + return false; + } + + std::vector state_data(n_state_size_cur); + file.read_raw(state_data.data(), n_state_size_cur); + + llama_set_state_data(ctx, state_data.data()); + } + + return true; +} + +bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + llama_file file(path_session, "wb"); + + file.write_u32(LLAMA_SESSION_MAGIC); + file.write_u32(LLAMA_SESSION_VERSION); + + file.write_raw(&ctx->model.hparams, sizeof(llama_hparams)); + + // save the prompt + file.write_u32((uint32_t) n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); + + // save the context state + { + const size_t n_state_size = llama_get_state_size(ctx); + + std::vector state_data(n_state_size); + llama_copy_state_data(ctx, state_data.data()); + + file.write_raw(state_data.data(), n_state_size); + } + + return true; +} + int llama_eval( struct llama_context * ctx, const llama_token * tokens, @@ -2693,57 +2772,3 @@ const char * llama_print_system_info(void) { std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { return ctx->model.tensors_by_name; } - -size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { - // TODO leverage mmap - llama_file file(path_session, "rb"); - const uint32_t magic = file.read_u32(); - const uint32_t version = file.read_u32(); - - if (!(magic == 'ggsn' && version == 0)) { - fprintf(stderr, "%s : unknown (magic, version) for session file: %08x, %08x\n", __func__, magic, version); - return 0; - } - - llama_hparams session_hparams; - file.read_raw(&session_hparams, sizeof(llama_hparams)); - - // REVIEW - if (session_hparams != ctx->model.hparams) { - fprintf(stderr, "%s : model hparams didn't match from session file!\n", __func__); - return 0; - } - - const uint32_t n_token_count = file.read_u32(); - LLAMA_ASSERT(n_token_capacity >= n_token_count); - file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); - *n_token_count_out = n_token_count; - - const size_t n_state_size = file.size - file.tell(); - const size_t n_orig_state_size = llama_get_state_size(ctx); - if (n_state_size != n_orig_state_size) { - fprintf(stderr, "%s : failed to validate state size\n", __func__); - } - std::unique_ptr state_data(new uint8_t[n_state_size]); - file.read_raw(state_data.get(), n_state_size); - return llama_set_state_data(ctx, state_data.get()); -} - -size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { - // TODO save temp & swap - llama_file file(path_session, "wb"); - - const size_t n_state_size = llama_get_state_size(ctx); - std::unique_ptr state_data(new uint8_t[n_state_size]); - llama_copy_state_data(ctx, state_data.get()); - - file.write_u32('ggsn'); // magic - file.write_u32(0); // version - file.write_raw(&ctx->model.hparams, sizeof(llama_hparams)); - - file.write_u32((uint32_t) n_token_count); // REVIEW - file.write_raw(tokens, sizeof(llama_token) * n_token_count); - - file.write_raw(state_data.get(), n_state_size); - return n_state_size; // REVIEW -} diff --git a/llama.h b/llama.h index 9fbba7643..2f6ce8d83 100644 --- a/llama.h +++ b/llama.h @@ -19,9 +19,11 @@ # define LLAMA_API #endif -#define LLAMA_FILE_VERSION 1 -#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex -#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files +#define LLAMA_FILE_VERSION 1 +#define LLAMA_FILE_MAGIC 'ggjt' +#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml' +#define LLAMA_SESSION_MAGIC 'ggsn' +#define LLAMA_SESSION_VERSION 0 #ifdef __cplusplus extern "C" { @@ -138,8 +140,8 @@ extern "C" { LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src); // Save/load session file - LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); - LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); + LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); + LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); // Run the llama inference to obtain the logits and probabilities for the next token. // tokens + n_tokens is the provided batch of new tokens to process From 2bdc09646d8c6cb74a6f573e9081586b4b83b9d1 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Mon, 1 May 2023 05:56:07 -0600 Subject: [PATCH 4/7] ggml : fix ggml_used_mem() (#1264) --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 8cc48344e..5b5ed925e 100644 --- a/ggml.c +++ b/ggml.c @@ -4411,7 +4411,7 @@ void ggml_free(struct ggml_context * ctx) { } size_t ggml_used_mem(const struct ggml_context * ctx) { - return ctx->objects_end->offs + ctx->objects_end->size; + return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size; } size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) { From ea3a0ad6b6b5ca4693b94acd4cb32e2803f66fae Mon Sep 17 00:00:00 2001 From: xloem <0xloem@gmail.com> Date: Mon, 1 May 2023 08:58:51 -0400 Subject: [PATCH 5/7] llama : update stubs for systems without mmap and mlock (#1266) Co-authored-by: John Doe --- llama-util.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llama-util.h b/llama-util.h index 5f9f70ecc..d531588d5 100644 --- a/llama-util.h +++ b/llama-util.h @@ -243,7 +243,8 @@ struct llama_mmap { #else static constexpr bool SUPPORTED = false; - llama_mmap(struct llama_file *) { + llama_mmap(struct llama_file *, bool prefetch = true) { + (void)prefetch; throw std::string("mmap not supported"); } #endif @@ -382,8 +383,13 @@ struct llama_mlock { #else static constexpr bool SUPPORTED = false; - void raw_lock(const void * addr, size_t size) { + size_t lock_granularity() { + return (size_t) 65536; + } + + bool raw_lock(const void * addr, size_t size) { fprintf(stderr, "warning: mlock not supported on this system\n"); + return false; } void raw_unlock(const void * addr, size_t size) {} From 58b367c2d757c0ea12aec672382462b42204c724 Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Mon, 1 May 2023 18:11:07 +0200 Subject: [PATCH 6/7] cuBLAS: refactor and optimize f16 mat mul performance (#1259) * cuBLAS: refactor, convert fp16 to fp32 on device * cuBLAS: use multiple streams, choose smartly between mul_mat_q and mul_mat_f16 * fix build * cuBLAS: update block_q5_1 --- ggml-cuda.cu | 429 +++++++++++++++++++++++++++++++++++++++++++++------ ggml-cuda.h | 47 +----- ggml.c | 252 ++++++++++-------------------- ggml.h | 11 ++ 4 files changed, 480 insertions(+), 259 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c1ec306f0..e8a1e77cb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1,11 +1,38 @@ +#include +#include #include #include -#include #include -#include "ggml-cuda.h" -typedef uint16_t ggml_fp16_t; -static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size"); +#include +#include +#include + +#include "ggml-cuda.h" +#include "ggml.h" + +static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +#define CUDA_CHECK(err) \ + do { \ + cudaError_t err_ = (err); \ + if (err_ != cudaSuccess) { \ + fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ + cudaGetErrorString(err_)); \ + exit(1); \ + } \ + } while (0) + +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + exit(1); \ + } \ + } while (0) + +typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); #define QK4_0 32 typedef struct { @@ -24,14 +51,14 @@ static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 b #define QK4_2 16 typedef struct { - __half d; // delta + half d; // delta uint8_t qs[QK4_2 / 2]; // nibbles / quants } block_q4_2; static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding"); #define QK5_0 32 typedef struct { - __half d; // delta + half d; // delta uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_0 / 2]; // nibbles / quants } block_q5_0; @@ -39,9 +66,9 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5 #define QK5_1 32 typedef struct { - __half d; // delta - __half m; // min - uint32_t qh; // 5-th bit of quants + half d; // delta + half m; // min + uint8_t qh[4]; // 5-th bit of quants uint8_t qs[QK5_1 / 2]; // nibbles / quants } block_q5_1; static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding"); @@ -162,7 +189,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) { const uint8_t * pp = x[i].qs; - const uint32_t qh = x[i].qh; + uint32_t qh; + memcpy(&qh, x[i].qh, sizeof(qh)); for (int l = 0; l < QK5_1; l += 2) { const uint8_t vi = pp[l/2]; @@ -197,37 +225,50 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) { } } -void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_0; dequantize_block_q4_0<<>>(vx, y); } -void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_1; dequantize_block_q4_1<<>>(vx, y); } -void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK4_2; dequantize_block_q4_2<<>>(vx, y); } -void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK5_0; dequantize_block_q5_0<<>>(vx, y); } -void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK5_1; dequantize_block_q5_1<<>>(vx, y); } -void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { +static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) { const int nb = k / QK8_0; dequantize_block_q8_0<<>>(vx, y); } -dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) { +// TODO: optimize +static __global__ void convert_fp16_to_fp32(const void * vx, float * y) { + const half * x = (const half *) vx; + + const int i = blockIdx.x; + + y[i] = __half2float(x[i]); +} + +static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) { + convert_fp16_to_fp32<<>>(x, y); +} + +static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { switch (type) { case GGML_TYPE_Q4_0: return dequantize_row_q4_0_cuda; @@ -241,6 +282,8 @@ dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) { return dequantize_row_q5_1_cuda; case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda; + case GGML_TYPE_F16: + return convert_fp16_to_fp32_cuda; default: return nullptr; } @@ -271,7 +314,7 @@ struct cuda_buffer { static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS]; static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { +static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -290,7 +333,7 @@ void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { return ptr; } -void ggml_cuda_pool_free(void * ptr, size_t size) { +static void ggml_cuda_pool_free(void * ptr, size_t size) { scoped_spin_lock lock(g_cuda_pool_lock); for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { @@ -305,28 +348,55 @@ void ggml_cuda_pool_free(void * ptr, size_t size) { CUDA_CHECK(cudaFree(ptr)); } -cublasHandle_t g_cublasH = nullptr; -cudaStream_t g_cudaStream = nullptr; -cudaStream_t g_cudaStream2 = nullptr; -cudaEvent_t g_cudaEvent = nullptr; +#define GGML_CUDA_MAX_STREAMS 8 +#define GGML_CUDA_MAX_EVENTS 64 +static cublasHandle_t g_cublasH = nullptr; +static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr }; +static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr }; +static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr }; void ggml_init_cublas() { if (g_cublasH == nullptr) { - // create cublas handle, bind a stream - CUBLAS_CHECK(cublasCreate(&g_cublasH)); - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking)); - CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream)); + // create streams + for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) { + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[i], cudaStreamNonBlocking)); + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[i], cudaStreamNonBlocking)); + } + // create events + for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) { + CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[i], cudaEventDisableTiming)); + } - // create additional stream and event for synchronization - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking)); - CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming)); + // create cublas handle + CUBLAS_CHECK(cublasCreate(&g_cublasH)); + CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH)); // configure logging to stdout - // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL)); + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); } } -cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) { +void * ggml_cuda_host_malloc(size_t size) { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return nullptr; + } + + void * ptr = nullptr; + cudaError_t err = cudaMallocHost((void **) &ptr, size); + if (err != cudaSuccess) { + fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + size/1024.0/1024.0, cudaGetErrorString(err)); + return nullptr; + } + + return ptr; +} + +void ggml_cuda_host_free(void * ptr) { + CUDA_CHECK(cudaFreeHost(ptr)); +} + +static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) { const uint64_t ne0 = src->ne[0]; const uint64_t ne1 = src->ne[1]; const uint64_t nb0 = src->nb[0]; @@ -354,22 +424,293 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, } } -void * ggml_cuda_host_malloc(size_t size) { - if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { - return nullptr; +static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int n_mm = ne03 * ne02; + + size_t x_size, y_size, d_size; + float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size); + float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size); + float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + int i = i03*ne02 + i02; + cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; + + float * c_X = d_X + i * x_ne; + float * c_Y = d_Y + i * y_ne; + float * c_D = d_D + i * d_ne; + + // copy data to device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream)); + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream)); + + // compute + CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); + CUBLAS_CHECK( + cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, c_X, ne00, + c_Y, ne10, + &beta, c_D, ne01)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + } } - void * ptr = nullptr; - cudaError_t err = cudaMallocHost((void **) &ptr, size); - if (err != cudaSuccess) { - fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", - size/1024.0/1024.0, cudaGetErrorString(err)); - return nullptr; + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); +} + +static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb10 = src1->nb[0]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb13 = src1->nb[3]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int n_mm = ne03 * ne02; + + size_t x_size, y_size, d_size; + half * d_X = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size); + half * d_Y = (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size); + float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); + + bool src1_cont_rows = nb10 == sizeof(float); + bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + int i = i03*ne02 + i02; + cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; + + half * c_X = d_X + i * x_ne; + half * c_Y = d_Y + i * y_ne; + float * c_D = d_D + i * d_ne; + + // copy src0 to device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream)); + + // convert src1 to fp16 + // TODO: use multiple threads + ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); + char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; + if (src1_cont_rows) { + if (src1_cont_cols) { + ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); + } + } + } + else { + for (int64_t i01 = 0; i01 < ne11; i01++) { + for (int64_t i00 = 0; i00 < ne10; i00++) { + // very slow due to no inlining + tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); + } + } + } + + // copy src1 to device + CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream)); + + // compute + CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); + CUBLAS_CHECK( + cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, c_X, CUDA_R_16F, ne00, + c_Y, CUDA_R_16F, ne10, + &beta, c_D, CUDA_R_32F, ne01, + CUBLAS_COMPUTE_32F_FAST_16F, + CUBLAS_GEMM_DEFAULT)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + } } - return ptr; + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); } -void ggml_cuda_host_free(void * ptr) { - CUDA_CHECK(cudaFreeHost(ptr)); +static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int nb2 = dst->nb[2]; + const int nb3 = dst->nb[3]; + const ggml_type type = src0->type; + + const float alpha = 1.0f; + const float beta = 0.0f; + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + const int n_mm = ne03 * ne02; + const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type); + + size_t x_size, y_size, d_size, q_size; + float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size); + float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size); + float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size); + char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size); + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type); + GGML_ASSERT(to_fp32_cuda != nullptr); + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + int i = i03*ne02 + i02; + cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS]; + cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS]; + cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS]; + + float * c_X = d_X + i * x_ne; + float * c_Y = d_Y + i * y_ne; + float * c_D = d_D + i * d_ne; + char * c_Q = d_Q + i * q_sz; + + // copy src0 and convert to fp32 on device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2)); + to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2); + CUDA_CHECK(cudaGetLastError()); + CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2)); + + // copy src1 to device + CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream)); + + // wait for conversion + CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0)); + + // compute + CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream)); + CUBLAS_CHECK( + cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha, c_X, ne00, + c_Y, ne10, + &beta, c_D, ne01)); + + // copy dst to host + float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream)); + } + } + + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_pool_free(d_X, x_size); + ggml_cuda_pool_free(d_Y, y_size); + ggml_cuda_pool_free(d_D, d_size); + ggml_cuda_pool_free(d_Q, q_size); +} + +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + const int64_t ne10 = src1->ne[0]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + + // TODO: find the optimal values for these + if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && + src1->type == GGML_TYPE_F32 && + dst->type == GGML_TYPE_F32 && + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { + + return true; + } + + return false; +} + +bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { + size_t src0_sz = ggml_nbytes(src0); + size_t src1_sz = ggml_nbytes(src1); + + // mul_mat_q: src0 is converted to fp32 on device + size_t mul_mat_q_transfer = src0_sz + src1_sz; + + // mul_mat_f16: src1 is converted to fp16 on cpu + size_t mul_mat_f16_transfer = src0_sz + sizeof(half) * ggml_nelements(src1); + + // choose the smaller one to transfer to the device + // TODO: this is not always the best choice due to the overhead of converting to fp16 + return mul_mat_f16_transfer < mul_mat_q_transfer; +} + +void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { + GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst)); + + if (src0->type == GGML_TYPE_F32) { + ggml_cuda_mul_mat_f32(src0, src1, dst); + } + else if (src0->type == GGML_TYPE_F16) { + if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) { + ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize); + } + else { + ggml_cuda_mul_mat_q_f32(src0, src1, dst); + } + } + else if (ggml_is_quantized(src0->type)) { + ggml_cuda_mul_mat_q_f32(src0, src1, dst); + } + else { + GGML_ASSERT(false); + } +} + +size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { + if (ggml_cuda_mul_mat_use_f16(src0, src1, dst)) { + return ggml_nelements(src1) * sizeof(ggml_fp16_t); + } + else { + return 0; + } } diff --git a/ggml-cuda.h b/ggml-cuda.h index 36782d9e7..f7d6a8bc1 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -1,54 +1,19 @@ -#include -#include #include "ggml.h" #ifdef __cplusplus extern "C" { #endif -#define CUDA_CHECK(err) \ - do { \ - cudaError_t err_ = (err); \ - if (err_ != cudaSuccess) { \ - fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ - cudaGetErrorString(err_)); \ - exit(1); \ - } \ - } while (0) - -#define CUBLAS_CHECK(err) \ - do { \ - cublasStatus_t err_ = (err); \ - if (err_ != CUBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \ - exit(1); \ - } \ - } while (0) - -extern cublasHandle_t g_cublasH; -extern cudaStream_t g_cudaStream; -extern cudaStream_t g_cudaStream2; -extern cudaEvent_t g_cudaEvent; - void ggml_init_cublas(void); + +bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); + +// TODO: export these with GGML_API void * ggml_cuda_host_malloc(size_t size); void ggml_cuda_host_free(void * ptr); -void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size); -void ggml_cuda_pool_free(void * ptr, size_t size); - -void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream); -void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream); - -cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream); - -typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); -dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type); - #ifdef __cplusplus } #endif diff --git a/ggml.c b/ggml.c index 5b5ed925e..bce7a7a57 100644 --- a/ggml.c +++ b/ggml.c @@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) { #define UNUSED(x) (void)(x) #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) -#define GGML_ASSERT(x) \ - do { \ - if (!(x)) { \ - fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ - abort(); \ - } \ - } while (0) - #if defined(GGML_USE_ACCELERATE) #include #elif defined(GGML_USE_OPENBLAS) @@ -370,6 +362,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) { return GGML_FP32_TO_FP16(x); } +void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) { + for (size_t i = 0; i < n; i++) { + y[i] = GGML_FP16_TO_FP32(x[i]); + } +} + +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) { + size_t i = 0; +#if defined(__F16C__) + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for(; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; i++) { + y[i] = GGML_FP32_TO_FP16(x[i]); + } +} + + // // timing // @@ -4325,12 +4343,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } - // initialize cuBLAS - #if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUBLAS) ggml_init_cublas(); - #elif defined(GGML_USE_CLBLAST) +#elif defined(GGML_USE_CLBLAST) ggml_cl_init(); - #endif +#endif is_first_call = false; } @@ -8101,7 +8118,7 @@ static void ggml_compute_forward_rms_norm( // ggml_compute_forward_mul_mat -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) // helper function to determine if it is better to use BLAS or not // for large matrices, BLAS is faster static bool ggml_compute_forward_mul_mat_use_blas( @@ -8117,12 +8134,9 @@ static bool ggml_compute_forward_mul_mat_use_blas( const int64_t ne1 = dst->ne[1]; // TODO: find the optimal values for these - if ( -#if !defined(GGML_USE_CUBLAS) - ggml_is_contiguous(src0) && + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && -#endif - ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { + (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) { /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/ return true; @@ -8130,7 +8144,6 @@ static bool ggml_compute_forward_mul_mat_use_blas( return false; } - #endif static void ggml_compute_forward_mul_mat_f32( @@ -8146,7 +8159,7 @@ static void ggml_compute_forward_mul_mat_f32( const int64_t ne02 = src0->ne[2]; const int64_t ne03 = src0->ne[3]; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) const int64_t ne10 = src1->ne[0]; #endif const int64_t ne11 = src1->ne[1]; @@ -8203,7 +8216,16 @@ static void ggml_compute_forward_mul_mat_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -8217,43 +8239,13 @@ static void ggml_compute_forward_mul_mat_f32( return; } -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); -#endif - for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { -#if !defined(GGML_USE_CUBLAS) const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03); const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); -#endif float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CUBLAS) - // copy data to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream)); - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream)); - - // compute - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, ne00, - d_Y, ne10, - &beta, d_D, ne01)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) // zT = y * xT ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, ne11, ne01, ne10, @@ -8270,12 +8262,6 @@ static void ggml_compute_forward_mul_mat_f32( #endif } } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); -#endif //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; @@ -8405,7 +8391,16 @@ static void ggml_compute_forward_mul_mat_f16_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { GGML_ASSERT(nb10 == sizeof(float)); @@ -8421,37 +8416,8 @@ static void ggml_compute_forward_mul_mat_f16_f32( return; } -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size; - ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); -#endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { -#if defined(GGML_USE_CUBLAS) - // copy src0 while converting src1 - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream)); - - // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16 - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02); - { - size_t id = 0; - for (int64_t i01 = 0; i01 < ne11; ++i01) { - for (int64_t i00 = 0; i00 < ne10; ++i00) { - wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10)); - } - } - - assert(id*sizeof(ggml_fp16_t) <= params->wsize); - } -#else float * const wdata = params->wdata; { size_t id = 0; @@ -8463,28 +8429,8 @@ static void ggml_compute_forward_mul_mat_f16_f32( assert(id*sizeof(float) <= params->wsize); } -#endif -#if defined(GGML_USE_CUBLAS) - const ggml_fp16_t * y = (ggml_fp16_t *) wdata; - float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - - // copy data to device - CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream)); - - // compute - CUBLAS_CHECK( - cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, CUDA_R_16F, ne00, - d_Y, CUDA_R_16F, ne10, - &beta, d_D, CUDA_R_32F, ne01, - CUBLAS_COMPUTE_32F, - CUBLAS_GEMM_DEFAULT)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) const float * x = wdata; const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); @@ -8513,12 +8459,6 @@ static void ggml_compute_forward_mul_mat_f16_f32( } } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); -#endif /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/ return; @@ -8671,7 +8611,16 @@ static void ggml_compute_forward_mul_mat_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(src0, src1, dst)) { + if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) { + ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize); + } + return; + } +#endif + +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { if (params->ith != 0) { return; @@ -8685,25 +8634,8 @@ static void ggml_compute_forward_mul_mat_q_f32( return; } -#if defined(GGML_USE_CUBLAS) - const float alpha = 1.0f; - const float beta = 0.0f; - const int x_ne = ne01 * ne00; - const int y_ne = ne11 * ne10; - const int d_ne = ne11 * ne01; - - size_t x_size, y_size, d_size, q_size; - float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size); - float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size); - float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size); - void * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size); - - const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type); - GGML_ASSERT(dequantize_row_q_cuda != NULL); -#else float * const wdata = params->wdata; dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q; -#endif for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { @@ -8711,14 +8643,7 @@ static void ggml_compute_forward_mul_mat_q_f32( float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); -#if defined(GGML_USE_CUBLAS) - // copy and dequantize on device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2)); - - dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2); - CUDA_CHECK(cudaGetLastError()); - CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) const void* x = (char *) src0->data + i03*nb03 + i02*nb02; #else { @@ -8734,24 +8659,7 @@ static void ggml_compute_forward_mul_mat_q_f32( const float * x = wdata; #endif -#if defined(GGML_USE_CUBLAS) - // copy data to device - CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream)); - - // wait for dequantization - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0)); - - // compute - CUBLAS_CHECK( - cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha, d_X, ne00, - d_Y, ne10, - &beta, d_D, ne01)); - - // copy data to host - CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream)); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) // zT = y * xT ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T, ne11, ne01, ne10, @@ -8769,13 +8677,6 @@ static void ggml_compute_forward_mul_mat_q_f32( } } -#if defined(GGML_USE_CUBLAS) - CUDA_CHECK(cudaStreamSynchronize(g_cudaStream)); - ggml_cuda_pool_free(d_X, x_size); - ggml_cuda_pool_free(d_Y, y_size); - ggml_cuda_pool_free(d_D, d_size); - ggml_cuda_pool_free(d_Q, q_size); -#endif //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); return; @@ -11759,18 +11660,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) size_t cur = 0; +#if defined(GGML_USE_CUBLAS) + if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) { + node->n_tasks = 1; // TODO: this actually is doing nothing + // the threads are still spinning + cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node); + } + else +#endif if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; // TODO: this actually is doing nothing // the threads are still spinning -#if defined(GGML_USE_CUBLAS) - // with cuBLAS, we need memory for the full 3D / 4D data of src1 - cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); -#else // here we need memory just for single 2D matrix from src0 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); -#endif } else { cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1); } @@ -11779,13 +11683,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) #endif } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) { cur = 0; -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; } #endif } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) { node->n_tasks = 1; cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); diff --git a/ggml.h b/ggml.h index d6feacd78..ef5a048c3 100644 --- a/ggml.h +++ b/ggml.h @@ -197,6 +197,14 @@ #define GGML_MAX_OPT 4 #define GGML_DEFAULT_N_THREADS 4 +#define GGML_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + #ifdef __cplusplus extern "C" { #endif @@ -212,6 +220,9 @@ extern "C" { GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n); + GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n); + struct ggml_object; struct ggml_context; From f4cef87edfd1b2f8d5befd4fde54ca2e03987bea Mon Sep 17 00:00:00 2001 From: DannyDaemonic Date: Mon, 1 May 2023 09:23:47 -0700 Subject: [PATCH 7/7] Add git-based build information for better issue tracking (#1232) * Add git-based build information for better issue tracking * macOS fix * "build (hash)" and "CMAKE_SOURCE_DIR" changes * Redo "CMAKE_CURRENT_SOURCE_DIR" and clearer build messages * Fix conditional dependency on missing target * Broke out build-info.cmake, added find_package fallback, and added build into to all examples, added dependencies to Makefile * 4 space indenting for cmake, attempt to clean up my mess in Makefile * Short hash, less fancy Makefile, and don't modify build-info.h if it wouldn't change it --- .gitignore | 1 + CMakeLists.txt | 35 +++++++++++++ Makefile | 51 ++++++++++++------- examples/benchmark/CMakeLists.txt | 3 ++ examples/benchmark/benchmark-matmult.cpp | 4 +- examples/embedding/CMakeLists.txt | 3 ++ examples/embedding/embedding.cpp | 5 +- examples/main/CMakeLists.txt | 3 ++ examples/main/main.cpp | 5 +- examples/perplexity/CMakeLists.txt | 3 ++ examples/perplexity/perplexity.cpp | 5 +- examples/quantize-stats/quantize-stats.cpp | 3 ++ examples/quantize/CMakeLists.txt | 3 ++ examples/quantize/quantize.cpp | 3 ++ examples/save-load-state/CMakeLists.txt | 3 ++ examples/save-load-state/save-load-state.cpp | 3 ++ scripts/build-info.cmake | 53 ++++++++++++++++++++ scripts/build-info.sh | 22 ++++++++ 18 files changed, 186 insertions(+), 22 deletions(-) create mode 100644 scripts/build-info.cmake create mode 100755 scripts/build-info.sh diff --git a/.gitignore b/.gitignore index 565866fd4..e479c6180 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ models/* /vdot /Pipfile +build-info.h arm_neon.h compile_commands.json diff --git a/CMakeLists.txt b/CMakeLists.txt index 098306126..f6a66daa3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,41 @@ option(LLAMA_CLBLAST "llama: use CLBlast" option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) +# +# Build info header +# + +# Write header template to binary dir to keep source directory clean +file(WRITE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in" "\ +#ifndef BUILD_INFO_H\n\ +#define BUILD_INFO_H\n\ +\n\ +#define BUILD_NUMBER @BUILD_NUMBER@\n\ +#define BUILD_COMMIT \"@BUILD_COMMIT@\"\n\ +\n\ +#endif // BUILD_INFO_H\n\ +") + +# Generate initial build-info.h +include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) + +if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git") + # Add a custom target for build-info.h + add_custom_target(BUILD_INFO ALL DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") + + # Add a custom command to rebuild build-info.h when .git/index changes + add_custom_command( + OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h" + COMMENT "Generating build details from Git" + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/.git/index" + VERBATIM + ) +else() + message(WARNING "Git repository not found; to enable automatic generation of build info, make sure Git is installed and the project is a Git repository.") +endif() + # # Compile flags # diff --git a/Makefile b/Makefile index 1d62a4438..6ebc3c5b9 100644 --- a/Makefile +++ b/Makefile @@ -181,41 +181,56 @@ llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c $< -o $@ -clean: - rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult +libllama.so: llama.o ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) -main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +clean: + rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state build-info.h + +# +# Examples +# + +main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -libllama.so: llama.o ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) +build-info.h: $(wildcard .git/index) scripts/build-info.sh + @scripts/build-info.sh > $@.tmp + @if ! cmp -s $@.tmp $@; then \ + mv $@.tmp $@; \ + else \ + rm $@.tmp; \ + fi # # Tests # -benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS) - $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) +benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ./$@ +vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) + $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) + .PHONY: tests tests: bash ./tests/run-tests.sh diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt index 05deebcd1..037696194 100644 --- a/examples/benchmark/CMakeLists.txt +++ b/examples/benchmark/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET benchmark) add_executable(${TARGET} benchmark-matmult.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp index 19cbab1c3..2cc1a1477 100644 --- a/examples/benchmark/benchmark-matmult.cpp +++ b/examples/benchmark/benchmark-matmult.cpp @@ -1,5 +1,6 @@ #include #include "ggml.h" +#include "build-info.h" #include #include #include @@ -90,9 +91,10 @@ int main(int argc, char ** argv) { } } - // create the ggml context + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); printf("Starting Test\n"); + // create the ggml context struct ggml_context * ctx; //const int sizex = 4096; //const int sizey = 11008; diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 88c425d4a..db73b6b44 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET embedding) add_executable(${TARGET} embedding.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index e10de619c..b3e001476 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include @@ -18,11 +19,13 @@ int main(int argc, char ** argv) { "expect poor results\n", __func__, params.n_ctx); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.seed <= 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index b2dcc2910..c364242fb 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET main) add_executable(${TARGET} main.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 78fc9a197..7dc100512 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -5,6 +5,7 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include #include @@ -81,11 +82,13 @@ int main(int argc, char ** argv) { "expect poor results\n", __func__, params.n_ctx); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.seed <= 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index 5836df8b2..61b17b828 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET perplexity) add_executable(${TARGET} perplexity.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 615157e7b..2ca338835 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include #include @@ -106,11 +107,13 @@ int main(int argc, char ** argv) { "expect poor results\n", __func__, params.n_ctx); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.seed <= 0) { params.seed = time(NULL); } - fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); + fprintf(stderr, "%s: seed = %d\n", __func__, params.seed); std::mt19937 rng(params.seed); if (params.random_prompt) { diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 4e6c2c831..9a2aa7c64 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "build-info.h" #define LLAMA_API_INTERNAL #include "llama.h" @@ -308,6 +309,8 @@ int main(int argc, char ** argv) { return 1; } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + // load the model fprintf(stderr, "Loading model\n"); diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index fb27d4517..475fc8be8 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index dd175c690..198bd5fcb 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "llama.h" +#include "build-info.h" #include #include @@ -50,6 +51,8 @@ int main(int argc, char ** argv) { ftype = (enum llama_ftype)atoi(argv[3]); } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + int nthread = argc > 4 ? atoi(argv[4]) : 0; const int64_t t_main_start_us = ggml_time_us(); diff --git a/examples/save-load-state/CMakeLists.txt b/examples/save-load-state/CMakeLists.txt index cff79fa1f..08dbe5c2b 100644 --- a/examples/save-load-state/CMakeLists.txt +++ b/examples/save-load-state/CMakeLists.txt @@ -2,3 +2,6 @@ set(TARGET save-load-state) add_executable(${TARGET} save-load-state.cpp) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) +if(TARGET BUILD_INFO) + add_dependencies(${TARGET} BUILD_INFO) +endif() diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index f1531ba39..ea0a984d9 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "llama.h" +#include "build-info.h" #include #include @@ -17,6 +18,8 @@ int main(int argc, char ** argv) { return 1; } + fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT); + if (params.n_predict < 0) { params.n_predict = 16; } diff --git a/scripts/build-info.cmake b/scripts/build-info.cmake new file mode 100644 index 000000000..fb46ed2b5 --- /dev/null +++ b/scripts/build-info.cmake @@ -0,0 +1,53 @@ +set(TEMPLATE_FILE "${CMAKE_BINARY_DIR}/BUILD_INFO.h.in") +set(HEADER_FILE "${CMAKE_CURRENT_SOURCE_DIR}/build-info.h") +set(BUILD_NUMBER 0) +set(BUILD_COMMIT "unknown") + +# Look for git +find_package(Git) +if(NOT Git_FOUND) + execute_process( + COMMAND which git + OUTPUT_VARIABLE GIT_EXECUTABLE + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(NOT GIT_EXECUTABLE STREQUAL "") + set(Git_FOUND TRUE) + message(STATUS "Found Git using 'which': ${GIT_EXECUTABLE}") + else() + message(WARNING "Git not found using 'find_package' or 'which'. Build info will not be accurate. Consider installing Git or ensuring it is in the PATH.") + endif() +endif() + +# Get the commit count and hash +if(Git_FOUND) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE HEAD + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GIT_HEAD_RESULT + ) + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE COUNT + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE GIT_COUNT_RESULT + ) + if(GIT_HEAD_RESULT EQUAL 0 AND GIT_COUNT_RESULT EQUAL 0) + set(BUILD_COMMIT ${HEAD}) + set(BUILD_NUMBER ${COUNT}) + endif() +endif() + +# Only write the header if it's changed to prevent unnecessary recompilation +if(EXISTS ${HEADER_FILE}) + file(STRINGS ${HEADER_FILE} CONTENTS REGEX "BUILD_COMMIT \"([^\"]*)\"") + list(GET CONTENTS 0 EXISTING) + if(NOT EXISTING STREQUAL "#define BUILD_COMMIT \"${BUILD_COMMIT}\"") + configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) + endif() +else() + configure_file(${TEMPLATE_FILE} ${HEADER_FILE}) +endif() diff --git a/scripts/build-info.sh b/scripts/build-info.sh new file mode 100755 index 000000000..507d7e153 --- /dev/null +++ b/scripts/build-info.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +BUILD_NUMBER="0" +BUILD_COMMIT="unknown" + +REV_LIST=$(git rev-list --count HEAD) +if [ $? -eq 0 ]; then + BUILD_NUMBER=$REV_LIST +fi + +REV_PARSE=$(git rev-parse --short HEAD) +if [ $? -eq 0 ]; then + BUILD_COMMIT=$REV_PARSE +fi + +echo "#ifndef BUILD_INFO_H" +echo "#define BUILD_INFO_H" +echo "" +echo "#define BUILD_NUMBER $BUILD_NUMBER" +echo "#define BUILD_COMMIT \"$BUILD_COMMIT\"" +echo "" +echo "#endif // BUILD_INFO_H"