diff --git a/Makefile b/Makefile index 6170eedbd..0c92ed164 100644 --- a/Makefile +++ b/Makefile @@ -160,7 +160,8 @@ endif ifneq ($(filter ppc64%,$(UNAME_M)),) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) ifneq (,$(findstring POWER9,$(POWER9_M))) - CFLAGS += -mpower9-vector + CFLAGS += -mcpu=power9 + CXXFLAGS += -mcpu=power9 endif # Require c++23's std::byteswap for big-endian support. ifeq ($(UNAME_M),ppc64) diff --git a/alpaca.sh b/alpaca.sh index 2f36d6f54..d8a9f456a 100755 --- a/alpaca.sh +++ b/alpaca.sh @@ -3,4 +3,4 @@ # Temporary script - will be removed in the future # -./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 +./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 diff --git a/chat.sh b/chat.sh index 24a0f10ad..5531315b3 100755 --- a/chat.sh +++ b/chat.sh @@ -3,4 +3,4 @@ # Temporary script - will be removed in the future # -./main -m ./models/7B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt +./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt diff --git a/examples/chatLLaMa b/examples/chatLLaMa index 97ababbc5..4265d7b66 100755 --- a/examples/chatLLaMa +++ b/examples/chatLLaMa @@ -13,7 +13,7 @@ N_PREDICTS="${N_PREDICTS:-2048}" # Note: you can also override the generation options by specifying them on the command line: # For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 -GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --repeat_penalty 1.17647}" +GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS ./main $GEN_OPTIONS \ diff --git a/expose.cpp b/expose.cpp index f473d8f5f..7cba6a8f1 100644 --- a/expose.cpp +++ b/expose.cpp @@ -192,7 +192,7 @@ extern "C" { std::string concat_output = ""; bool startedsampling = false; - printf("\nProcessing Prompt: "); + printf("\nProcessing Prompt (%d tokens): ",embd_inp.size()); while (remaining_tokens > 0) { @@ -224,7 +224,7 @@ extern "C" { if(!startedsampling) { startedsampling = true; - printf("\nGenerating: "); + printf("\nGenerating (%d tokens): ",params.n_predict); } { diff --git a/ggml.c b/ggml.c index 0e4b1466c..db68ed144 100644 --- a/ggml.c +++ b/ggml.c @@ -1,5 +1,5 @@ -// Defines CLOCK_MONOTONIC on Linux -#define _POSIX_C_SOURCE 199309L +// Defines CLOCK_MONOTONIC and asprintf on Linux +#define _GNU_SOURCE #include "ggml.h" @@ -10,6 +10,7 @@ #endif #include +#include #include #include #include @@ -31,7 +32,6 @@ #else // ref: https://github.com/ggerganov/whisper.cpp/issues/168 #include -#include #endif typedef volatile LONG atomic_int; @@ -83,6 +83,17 @@ typedef void* thread_ret_t; #define static_assert(cond, msg) _Static_assert(cond, msg) #endif +#define GGML_MLOCK_SUPPORT 0 + +#ifdef __has_include + #if __has_include() + #undef GGML_MLOCK_SUPPORT + #define GGML_MLOCK_SUPPORT 1 + #include + #endif +#endif + + /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 @@ -164,6 +175,39 @@ typedef double ggml_float; #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) +#elif defined(__POWER9_VECTOR__) + +#define GGML_COMPUTE_FP16_TO_FP32(x) 
ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +/* the inline asm below is about 12% faster than the lookup method */ +#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + register float f; + register double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + register double d; + register ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; +} + #else // FP16 <-> FP32 @@ -261,6 +305,7 @@ static float table_f32_f16[1 << 16]; // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16) inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { @@ -451,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { assert(k % QK == 0); -#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) +#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) const int nb = k / QK; const size_t bs = sizeof(float) + QK/2; @@ -461,7 +506,52 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { uint8_t pp[QK/2]; #endif -#if __ARM_NEON +#if defined(__POWER9_VECTOR__) +#if QK == 32 + const vector float v85 = vec_splats(8.5f); + for (int i = 0; i < nb; i++) { + float amax = 0.0f; // absolute max + + vector float srcv [8]; + vector float asrcv[8]; + vector float amaxv[8]; + + for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l); + for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]); + + for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]); + //for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]); + amaxv[0] = vec_max(amaxv[0], amaxv[2]); + amaxv[4] = vec_max(amaxv[4], amaxv[6]); + //for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]); + amaxv[0] = vec_max(amaxv[0], amaxv[4]); + + amax = MAX( + MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)), + MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3))); + + const float d = amax / ((1 << 3) - 1); + const float id = d ? 
1.0/d : 0.0; + + *(float *)pd = d; + pd += bs; + + const vector float vid = vec_splats(id); + for (int l = 0; l < 8; l++) { + const vector float vf = vec_madd(srcv[l], vid, v85); + const vector signed int vi = vec_signed(vf); + + pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4); + pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4); + } + + //memcpy(pb, pp, sizeof(pp)); + pb += bs; + } +#else +#error "not implemented for QK" +#endif +#elif __ARM_NEON #if QK == 32 for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -2344,6 +2434,7 @@ struct ggml_context { size_t mem_size; void * mem_buffer; bool mem_buffer_owned; + bool mem_buffer_mlocked; int n_objects; @@ -2619,16 +2710,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } *ctx = (struct ggml_context) { - /*.mem_size =*/ params.mem_size, - /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), - /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, - /*.n_objects =*/ 0, - /*.objects_begin =*/ NULL, - /*.objects_end =*/ NULL, - /*.scratch =*/ { 0, 0, NULL, }, - /*.scratch_save =*/ { 0, 0, NULL, }, + /*.mem_size =*/ params.mem_size, + /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size), + /*.mem_buffer_owned =*/ params.mem_buffer ? false : true, + /*.mem_buffer_mlocked =*/ false, + /*.n_objects =*/ 0, + /*.objects_begin =*/ NULL, + /*.objects_end =*/ NULL, + /*.scratch =*/ { 0, 0, NULL, }, + /*.scratch_save =*/ { 0, 0, NULL, }, }; + GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure + ggml_assert_aligned(ctx->mem_buffer); GGML_PRINT_DEBUG("%s: context initialized\n", __func__); @@ -2651,6 +2745,14 @@ void ggml_free(struct ggml_context * ctx) { GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n", __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size); +#if GGML_MLOCK_SUPPORT + if (ctx->mem_buffer_mlocked) { + if (munlock(ctx->mem_buffer, ctx->mem_size)) { + fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno)); + } + } +#endif + if (ctx->mem_buffer_owned) { free(ctx->mem_buffer); } @@ -2679,6 +2781,37 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) return result; } +bool ggml_mlock_supported(void) { + return GGML_MLOCK_SUPPORT; +} + +#if GGML_MLOCK_SUPPORT +#ifdef __APPLE__ + #define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \ + "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)." +#else + #define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)." 
+#endif +bool ggml_mlock(struct ggml_context * ctx, char ** err_p) { + if (ctx->mem_buffer_mlocked) { + return true; + } + if (mlock(ctx->mem_buffer, ctx->mem_size)) { + int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION, + ctx->mem_size, strerror(errno)); + GGML_ASSERT(ret >= 0); + return false; + } + ctx->mem_buffer_mlocked = true; + return true; +} +#else // GGML_MLOCK_SUPPORT +bool ggml_mlock(struct ggml_context * ctx, char ** err_p) { + *err_p = strdup("can't mlock because it's not supported on this system"); + return false; +} +#endif // GGML_MLOCK_SUPPORT + //////////////////////////////////////////////////////////////////////////////// struct ggml_tensor * ggml_new_tensor_impl( @@ -5713,17 +5846,28 @@ static bool ggml_compute_forward_mul_mat_use_blas( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - UNUSED(src0); + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; const int ne10 = src1->ne[0]; const int ne0 = dst->ne[0]; const int ne1 = dst->ne[1]; + // TMP: disable BLAS for now there is definitely a bug + return false; + // TODO: find the optimal values for these if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) { - //printf("BLAS: %d %d %d\n", ne0, ne1, ne10); + + // disable BLAS for Q4_0 and Q4_1 + // there is a bug that has to be fixed before enabling + if (src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1) { + return false; + } + + //printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01); return true; } diff --git a/ggml.h b/ggml.h index c7e6814a8..ddb97318b 100644 --- a/ggml.h +++ b/ggml.h @@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx); size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); +bool ggml_mlock_supported(void); +bool ggml_mlock(struct ggml_context * ctx, char ** err_p); + struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, diff --git a/llama.cpp b/llama.cpp index 710f83a10..fd922e426 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5,12 +5,25 @@ #include #include #include +#include #include #include #include #include #include +#define LLAMA_USE_SCRATCH +#define LLAMA_MAX_SCRATCH_BUFFERS 16 + +#define LLAMA_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + + // determine number of model parts based on the dimension static const std::unordered_map LLAMA_N_PARTS = { { 4096, 1 }, @@ -19,6 +32,52 @@ static const std::unordered_map LLAMA_N_PARTS = { { 8192, 8 }, }; +// available llama models +enum e_model { + MODEL_UNKNOWN, + MODEL_7B, + MODEL_13B, + MODEL_30B, + MODEL_65B, +}; + +static const size_t MB = 1024*1024; + +// computed for n_ctx == 2048 +// TODO: dynamically determine these sizes +// needs modifications in ggml + +static const std::map MEM_REQ_SCRATCH0 = { + { MODEL_7B, 512ull*MB }, + { MODEL_13B, 512ull*MB }, + { MODEL_30B, 512ull*MB }, + { MODEL_65B, 512ull*MB }, +}; + +static const std::map MEM_REQ_SCRATCH1 = { + { MODEL_7B, 512ull*MB }, + { MODEL_13B, 512ull*MB }, + { MODEL_30B, 512ull*MB }, + { MODEL_65B, 512ull*MB }, +}; + +// 2*n_embd*n_ctx*n_layer*sizeof(float16) +static const std::map MEM_REQ_KV_SELF = { + { MODEL_7B, 1026ull*MB }, + { MODEL_13B, 1608ull*MB }, + { MODEL_30B, 3124ull*MB }, + { MODEL_65B, 5120ull*MB }, +}; + +// this is mostly needed for temporary mul_mat buffers to dequantize the data +// not actually 
needed if BLAS is disabled +static const std::map MEM_REQ_EVAL = { + { MODEL_7B, 768ull*MB }, + { MODEL_13B, 1024ull*MB }, + { MODEL_30B, 1280ull*MB }, + { MODEL_65B, 1536ull*MB }, +}; + // default hparams (LLaMA 7B) struct llama_hparams { int32_t n_vocab = 32000; @@ -50,7 +109,20 @@ struct llama_layer { struct ggml_tensor * w3; }; +struct llama_kv_cache { + struct ggml_tensor * k; + struct ggml_tensor * v; + + struct ggml_context * ctx; + + std::vector buf; + + int n; // number of tokens currently in the cache +}; + struct llama_model { + e_model type = MODEL_UNKNOWN; + llama_hparams hparams; struct ggml_tensor * tok_embeddings; @@ -60,12 +132,18 @@ struct llama_model { std::vector layers; - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // + // context struct ggml_context * ctx; + + // key + value cache for the self attention + // TODO: move to llama_state + struct llama_kv_cache kv_self; + + // the model memory buffer + std::vector buf; + + // tensors + int n_loaded; std::unordered_map tensors; }; @@ -102,8 +180,91 @@ struct llama_context { // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector logits; bool logits_all = false; + + // input embedding (1-dimensional array: [n_embd]) + std::vector embedding; + + // memory buffers used to evaluate the model + // TODO: move in llama_state + std::vector buf_compute; + std::vector buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS]; + + int buf_last = 0; + size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 }; + + void use_buf(struct ggml_context * ctx, int i) { +#if defined(LLAMA_USE_SCRATCH) + size_t last_size = 0; + + if (i == -1) { + last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, }); + } else { + auto & buf = buf_scratch[i]; + last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), }); + } + + if (buf_last >= 0) { + buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size); + } + + buf_last = i; +#else + (void) i; + (void) ctx; +#endif + } + + size_t get_buf_max_mem(int i) const { +#if defined(LLAMA_USE_SCRATCH) + return buf_max_size[i]; +#else + (void) i; + return 0; +#endif + } }; +// +// kv cache +// + +static bool kv_cache_init( + const struct llama_hparams & hparams, + struct llama_kv_cache & cache, + ggml_type wtype, + int n_ctx) { + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + cache.buf.resize(2*n_elements*ggml_type_size(wtype) + 2u*MB); + + struct ggml_init_params params; + params.mem_size = cache.buf.size(); + params.mem_buffer = cache.buf.data(); + + cache.ctx = ggml_init(params); + + if (!cache.ctx) { + fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); + return false; + } + + cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + + return true; +} + +static void kv_cache_free(struct llama_kv_cache & cache) { + if (cache.ctx) { + ggml_free(cache.ctx); + cache.ctx = nullptr; + } +} + struct llama_context_params llama_context_default_params() { struct llama_context_params result = { /*.n_ctx =*/ 512, @@ -112,6 +273,8 @@ struct llama_context_params llama_context_default_params() { /*.f16_kv =*/ false, /*.logits_all =*/ false, /*.vocab_only =*/ false, + /*.use_mlock =*/ false, + /*.embedding =*/ false, }; return result; @@ -203,6 +366,22 @@ static bool llama_model_load( fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__); } + if (hparams.n_layer == 
32) { + model.type = e_model::MODEL_7B; + } + + if (hparams.n_layer == 40) { + model.type = e_model::MODEL_13B; + } + + if (hparams.n_layer == 60) { + model.type = e_model::MODEL_30B; + } + + if (hparams.n_layer == 80) { + model.type = e_model::MODEL_65B; + } + fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); @@ -213,6 +392,7 @@ static bool llama_model_load( fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); + fprintf(stderr, "%s: type = %d\n", __func__, model.type); } // load vocab @@ -309,11 +489,32 @@ static bool llama_model_load( fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); } + // print memory requirements + { + const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1; + + // this is the total memory required to run the inference + const size_t mem_required = + ctx_size + + MEM_REQ_SCRATCH0.at(model.type) + + MEM_REQ_SCRATCH1.at(model.type) + + MEM_REQ_EVAL.at (model.type); + + // this is the memory required by one llama_state + const size_t mem_required_state = + scale*MEM_REQ_KV_SELF.at(model.type); + + fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, + mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); + } + // create the ggml context { + lctx.model.buf.resize(ctx_size); + struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, + /*.mem_size =*/ lctx.model.buf.size(), + /*.mem_buffer =*/ lctx.model.buf.data(), }; model.ctx = ggml_init(params); @@ -376,25 +577,6 @@ static bool llama_model_load( } } - // key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - } - const size_t file_offset = fin.tellg(); fin.close(); @@ -418,9 +600,10 @@ static bool llama_model_load( // load weights { - int n_tensors = 0; size_t total_size = 0; + model.n_loaded = 0; + fprintf(stderr, "%s: ", __func__); while (true) { @@ -585,7 +768,10 @@ static bool llama_model_load( } //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); - if (++n_tensors % 8 == 0) { + model.n_loaded++; + + // progress + if (model.n_loaded % 8 == 0) { fprintf(stderr, "."); fflush(stderr); } @@ -593,14 +779,18 @@ static bool llama_model_load( fprintf(stderr, " done\n"); - fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded); + if (model.n_loaded == 0) { + fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__); + } else if (model.n_loaded != (int) model.tensors.size()) { + fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded); + return false; + } } fin.close(); } - lctx.logits.reserve(lctx.model.hparams.n_ctx); - lctx.t_load_us = ggml_time_us() - t_start_us; return true; @@ -626,6 +816,10 @@ static bool llama_eval_internal( const auto & model = lctx.model; const auto & hparams = model.hparams; + auto & kv_self = model.kv_self; + + LLAMA_ASSERT(!!kv_self.ctx); + const int n_embd = hparams.n_embd; const int n_layer = hparams.n_layer; const int n_ctx = hparams.n_ctx; @@ -634,27 +828,11 @@ static bool llama_eval_internal( const int n_rot = hparams.n_embd/hparams.n_head; auto & mem_per_token = lctx.mem_per_token; - - // TODO: fix this hardcoded size - static size_t buf_size = 512u*1024*1024; - static void * buf = malloc(buf_size); - - if (mem_per_token > 0 && mem_per_token*N > buf_size) { - const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead - //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); - - // reallocate - buf_size = buf_size_new; - buf = realloc(buf, buf_size); - if (buf == nullptr) { - fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); - return false; - } - } + auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf, + /*.mem_size =*/ buf_compute.size(), + /*.mem_buffer =*/ buf_compute.data(), }; struct ggml_context * ctx0 = ggml_init(params); @@ -671,6 +849,8 @@ static bool llama_eval_internal( struct ggml_tensor * cur; + lctx.use_buf(ctx0, 0); + // norm { cur = ggml_rms_norm(ctx0, inpL); @@ -689,8 +869,8 @@ static bool llama_eval_internal( // store key and value to memory if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); @@ -711,7 +891,7 @@ static bool llama_eval_internal( ggml_permute(ctx0, ggml_rope(ctx0, ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), n_embd/n_head, n_head, n_past + N), n_past, 
n_rot, 1), 0, 2, 1, 3); @@ -737,7 +917,7 @@ static bool llama_eval_internal( ggml_cpy(ctx0, ggml_permute(ctx0, ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd), n_embd/n_head, n_head, n_past + N), 1, 2, 0, 3), ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); @@ -759,6 +939,8 @@ static bool llama_eval_internal( cur); } + lctx.use_buf(ctx0, 1); + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); // feed-forward network @@ -777,7 +959,6 @@ static bool llama_eval_internal( model.layers[il].w3, cur); - cur = ggml_mul_mat(ctx0, model.layers[il].w1, cur); @@ -792,26 +973,34 @@ static bool llama_eval_internal( cur); } - cur = ggml_add(ctx0, cur, inpFF); + cur = ggml_add(ctx0, cur, inpFF); // input for next layer inpL = cur; } + lctx.use_buf(ctx0, 0); + + // used at the end to optionally extract the embeddings + struct ggml_tensor * embeddings = NULL; + // norm { + inpL = ggml_rms_norm(ctx0, inpL); // inpL = norm*inpL inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm, inpL), inpL); + + embeddings = inpL; } // lm_head - { - inpL = ggml_mul_mat(ctx0, model.output, inpL); - } + inpL = ggml_mul_mat(ctx0, model.output, inpL); + + lctx.use_buf(ctx0, -1); // logits -> probs //inpL = ggml_soft_max(ctx0, inpL); @@ -828,21 +1017,38 @@ static bool llama_eval_internal( //embd_w.resize(n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); - auto & logits_out = lctx.logits; + // extract logits + { + auto & logits_out = lctx.logits; - if (lctx.logits_all) { - logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N); - } else { - // return result for just the last token - logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + if (lctx.logits_all) { + logits_out.resize(n_vocab * N); + memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N); + } else { + // return result for just the last token + logits_out.resize(n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + } + } + + // extract embeddings + if (lctx.embedding.size()) { + auto & embedding_out = lctx.embedding; + + embedding_out.resize(n_embd); + memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); } if (mem_per_token == 0) { mem_per_token = ggml_used_mem(ctx0)/N; } - //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); + +#if 0 + printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__, + ggml_used_mem(ctx0)/1024.0/1024.0, + lctx.get_buf_max_mem(0)/1024.0/1024.0, + lctx.get_buf_max_mem(1)/1024.0/1024.0); +#endif ggml_free(ctx0); @@ -1415,19 +1621,64 @@ struct llama_context * llama_init_from_file( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) { + if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type, + params.vocab_only)) { fprintf(stderr, "%s: failed to load model\n", __func__); - delete ctx; + llama_free(ctx); return nullptr; } + if (params.use_mlock) { + char *err; + if (!ggml_mlock(ctx->model.ctx, &err)) { + fprintf(stderr, "%s\n", err); + free(err); + llama_free(ctx); + return nullptr; + } + } + + // reserve memory for context buffers + { + if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) { + fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); + llama_free(ctx); + return nullptr; + } + + { + const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); + fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); + } + + const auto & hparams = ctx->model.hparams; + if (params.logits_all) { + ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); + } else { + ctx->logits.reserve(hparams.n_ctx); + } + + if (params.embedding){ + ctx->embedding.reserve(hparams.n_embd); + } + + ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type)); + + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type)); + ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type)); + } + return ctx; } void llama_free(struct llama_context * ctx) { - ggml_free(ctx->model.ctx); + kv_cache_free(ctx->model.kv_self); + + if (ctx->model.ctx) { + ggml_free(ctx->model.ctx); + } delete ctx; } @@ -1491,6 +1742,10 @@ float * llama_get_logits(struct llama_context * ctx) { return ctx->logits.data(); } +float * llama_get_embeddings(struct llama_context * ctx) { + return ctx->embedding.data(); +} + const char * llama_token_to_str(struct llama_context * ctx, llama_token token) { if (token >= llama_n_vocab(ctx)) { return nullptr; @@ -1576,4 +1831,3 @@ const char * llama_print_system_info(void) { return s.c_str(); } - diff --git a/llama.h b/llama.h index 3df9ed1fd..9943d96ba 100644 --- a/llama.h +++ b/llama.h @@ -53,6 +53,8 @@ extern "C" { bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights + bool use_mlock; // force system to keep model in RAM + bool embedding; // embedding mode only }; LLAMA_API struct llama_context_params llama_context_default_params(); @@ -108,6 +110,10 @@ extern "C" { // Cols: n_vocab LLAMA_API float * llama_get_logits(struct llama_context * ctx); + // Get the embeddings for the input + // shape: [n_embd] (1-dimensional) + LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); + // Token Id -> String. 
Uses the vocabulary in the provided context LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token); diff --git a/llamacpp.dll b/llamacpp.dll index 0430f18fa..4b09ce6a6 100644 Binary files a/llamacpp.dll and b/llamacpp.dll differ diff --git a/main.cpp b/main.cpp index 5ba6d5a75..3f49ad997 100644 --- a/main.cpp +++ b/main.cpp @@ -199,6 +199,8 @@ int main(int argc, char ** argv) { lparams.seed = params.seed; lparams.f16_kv = params.memory_f16; lparams.logits_all = params.perplexity; + lparams.use_mlock = params.use_mlock; + lparams.embedding = params.embedding; ctx = llama_init_from_file(params.model.c_str(), lparams); @@ -215,11 +217,23 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } - // determine the required inference memory per token: - // TODO: better way to do that - { - const std::vector tmp = { 0, 1, 2, 3 }; - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters + // uncomment the "used_mem" line in llama.cpp to see the results + if (params.mem_test) { + { + const std::vector tmp(params.n_batch, 0); + llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); + } + + { + const std::vector tmp = { 0, }; + llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); + } + + llama_print_timings(ctx); + llama_free(ctx); + + return 0; } if (params.perplexity) { @@ -292,6 +306,7 @@ int main(int argc, char ** argv) { std::vector embd; + int last_n_size = params.repeat_last_n; std::vector last_n_tokens(last_n_size); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); @@ -324,6 +339,27 @@ int main(int argc, char ** argv) { // the first thing we will do is to output the prompt, so set color accordingly set_console_state(CONSOLE_STATE_PROMPT); + if (params.embedding){ + embd = embd_inp; + + if (embd.size() > 0) { + if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return 1; + } + } + + const auto embeddings = llama_get_embeddings(ctx); + + // TODO: print / use the embeddings + + if (params.use_color) { + printf(ANSI_COLOR_RESET); + } + + return 0; + } + while (remaining_tokens > 0 || params.interactive) { // predict if (embd.size() > 0) { @@ -336,7 +372,7 @@ int main(int argc, char ** argv) { n_past += embd.size(); embd.clear(); - if ((int) embd_inp.size() <= input_consumed) { + if ((int) embd_inp.size() <= input_consumed && !is_interacting) { // out of user input, sample next token const float top_k = params.top_k; const float top_p = params.top_p; @@ -363,7 +399,7 @@ int main(int argc, char ** argv) { } // replace end of text token with newline token when in interactive mode - if (id == llama_token_eos() && params.interactive) { + if (id == llama_token_eos() && params.interactive && !params.instruct) { id = llama_token_newline.front(); if (params.antiprompt.size() != 0) { // tokenize and inject first reverse prompt @@ -415,13 +451,16 @@ int main(int argc, char ** argv) { } // Check if each of the reverse prompts appears at the end of the output. 
- for (std::string antiprompt : params.antiprompt) { + for (std::string & antiprompt : params.antiprompt) { if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { is_interacting = true; + set_console_state(CONSOLE_STATE_USER_INPUT); + fflush(stdout); break; } } - if (is_interacting) { + + if (n_past > 0 && is_interacting) { // potentially set color to indicate we are taking user input set_console_state(CONSOLE_STATE_USER_INPUT); @@ -459,13 +498,20 @@ int main(int argc, char ** argv) { input_noecho = true; // do not echo this again } - is_interacting = false; + + if (n_past > 0) { + is_interacting = false; + } } // end of text token if (embd.back() == llama_token_eos()) { - fprintf(stderr, " [end of text]\n"); - break; + if (params.instruct) { + is_interacting = true; + } else { + fprintf(stderr, " [end of text]\n"); + break; + } } // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. @@ -480,7 +526,6 @@ int main(int argc, char ** argv) { #endif llama_print_timings(ctx); - llama_free(ctx); set_console_state(CONSOLE_STATE_DEFAULT); diff --git a/main.exe b/main.exe index 47624ee83..bc89eb362 100644 Binary files a/main.exe and b/main.exe differ diff --git a/quantize.exe b/quantize.exe index c90b88d73..979c0f4b2 100644 Binary files a/quantize.exe and b/quantize.exe differ diff --git a/utils.cpp b/utils.cpp index 45c9cabb1..2f995c12d 100644 --- a/utils.cpp +++ b/utils.cpp @@ -1,3 +1,5 @@ +#include "ggml.h" + #include "utils.h" #include @@ -77,8 +79,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); - } else if (arg == "--memory_f16") { - params.memory_f16 = true; + } else if (arg == "--memory_f32") { + params.memory_f16 = false; } else if (arg == "--top_p") { if (++i >= argc) { invalid_param = true; @@ -109,6 +111,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); } else if (arg == "-m" || arg == "--model") { if (++i >= argc) { invalid_param = true; @@ -117,12 +120,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { params.model = argv[i]; } else if (arg == "-i" || arg == "--interactive") { params.interactive = true; + } else if (arg == "--embedding") { + params.embedding = true; + } else if (arg == "--interactive-start") { + params.interactive = true; } else if (arg == "--interactive-first") { params.interactive_start = true; } else if (arg == "-ins" || arg == "--instruct") { params.instruct = true; } else if (arg == "--color") { params.use_color = true; + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--mtest") { + params.mem_test = true; } else if (arg == "-r" || arg == "--reverse-prompt") { if (++i >= argc) { invalid_param = true; @@ -185,11 +196,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty); fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n"); - fprintf(stderr, " --memory_f16 use f16 instead of f32 for memory key+value\n"); + fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); fprintf(stderr, " --temp N temperature (default: 
%.1f)\n", params.temp); fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); + if (ggml_mlock_supported()) { + fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + } + fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, "\n"); diff --git a/utils.h b/utils.h index b0de556c9..d469bc6a0 100644 --- a/utils.h +++ b/utils.h @@ -14,12 +14,13 @@ // struct gpt_params { - int32_t seed = -1; // RNG seed + int32_t seed = -1; // RNG seed int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); - int32_t n_predict = 128; // new tokens to predict - int32_t repeat_last_n = 64; // last n tokens to penalize - int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) - int32_t n_ctx = 512; //context size + int32_t n_predict = 128; // new tokens to predict + int32_t repeat_last_n = 64; // last n tokens to penalize + int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions) + int32_t n_ctx = 512; // context size + int32_t n_batch = 8; // batch size for prompt processing // sampling parameters int32_t top_k = 40; @@ -27,21 +28,25 @@ struct gpt_params { float temp = 0.80f; float repeat_penalty = 1.10f; - int32_t n_batch = 8; // batch size for prompt processing - std::string model = "models/lamma-7B/ggml-model.bin"; // model path std::string prompt = ""; + std::vector antiprompt; // string upon seeing which more user input is prompted - bool memory_f16 = false; // use f16 instead of f32 for memory kv + bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + + bool embedding = false; // get only sentence embedding bool interactive_start = false; // wait for user input immediately + bool instruct = false; // instruction mode (used for Alpaca models) bool ignore_eos = false; // do not stop generating after eos bool perplexity = false; // compute perplexity over the prompt + bool use_mlock = false; // use mlock to keep model in memory + bool mem_test = false; // compute maximum memory usage }; bool gpt_params_parse(int argc, char ** argv, gpt_params & params);