Merge branch 'master' into concedo

# Conflicts:
#	README.md

commit 3c78124aac

15 changed files with 591 additions and 118 deletions
3 Makefile

@@ -160,7 +160,8 @@ endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 ifneq (,$(findstring POWER9,$(POWER9_M)))
-CFLAGS += -mpower9-vector
+CFLAGS += -mcpu=power9
+CXXFLAGS += -mcpu=power9
 endif
 # Require c++23's std::byteswap for big-endian support.
 ifeq ($(UNAME_M),ppc64)
@@ -3,4 +3,4 @@
 # Temporary script - will be removed in the future
 #
 
-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
+./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
2 chat.sh

@@ -3,4 +3,4 @@
 # Temporary script - will be removed in the future
 #
 
-./main -m ./models/7B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
@@ -13,7 +13,7 @@ N_PREDICTS="${N_PREDICTS:-2048}"
 
 # Note: you can also override the generation options by specifying them on the command line:
 # For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
-GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --repeat_penalty 1.17647}"
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
 
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
 ./main $GEN_OPTIONS \
@@ -192,7 +192,7 @@ extern "C" {
 std::string concat_output = "";
 
 bool startedsampling = false;
-printf("\nProcessing Prompt: ");
+printf("\nProcessing Prompt (%d tokens): ",embd_inp.size());
 
 while (remaining_tokens > 0)
 {
@@ -224,7 +224,7 @@ extern "C" {
 if(!startedsampling)
 {
 startedsampling = true;
-printf("\nGenerating: ");
+printf("\nGenerating (%d tokens): ",params.n_predict);
 }
 
 {
174 ggml.c

@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define _POSIX_C_SOURCE 199309L
+// Defines CLOCK_MONOTONIC and asprintf on Linux
+#define _GNU_SOURCE
 
 #include "ggml.h"
 
@@ -10,6 +10,7 @@
 #endif
 
 #include <assert.h>
+#include <errno.h>
 #include <time.h>
 #include <math.h>
 #include <stdlib.h>
@@ -31,7 +32,6 @@
 #else
 // ref: https://github.com/ggerganov/whisper.cpp/issues/168
 #include <windows.h>
-#include <errno.h>
 #endif
 
 typedef volatile LONG atomic_int;
@@ -83,6 +83,17 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
 
+#define GGML_MLOCK_SUPPORT 0
+
+#ifdef __has_include
+#if __has_include(<sys/mman.h>)
+#undef GGML_MLOCK_SUPPORT
+#define GGML_MLOCK_SUPPORT 1
+#include <sys/mman.h>
+#endif
+#endif
+
+
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -164,6 +175,39 @@ typedef double ggml_float;
 #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 
+#elif defined(__POWER9_VECTOR__)
+
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+/* the inline asm below is about 12% faster than the lookup method */
+#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
+#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
+
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+register float f;
+register double d;
+__asm__(
+"mtfprd %0,%2\n"
+"xscvhpdp %0,%0\n"
+"frsp %1,%0\n" :
+/* temp */ "=d"(d),
+/* out */ "=f"(f):
+/* in */ "r"(h));
+return f;
+}
+
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+register double d;
+register ggml_fp16_t r;
+__asm__( /* xscvdphp can work on double or single precision */
+"xscvdphp %0,%2\n"
+"mffprd %1,%0\n" :
+/* temp */ "=d"(d),
+/* out */ "=r"(r):
+/* in */ "f"(f));
+return r;
+}
+
 #else
 
 // FP16 <-> FP32
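For readers without POWER9 hardware, here is a plain C sketch of what the conversion does; it is written for this note (not code from the patch) and assumes ggml_fp16_t carries raw IEEE binary16 bits, using uint16_t in its place to stay self-contained. The inline asm above instead lets the xscvhpdp/frsp instructions do this in hardware, which is where the quoted ~12% speedup over the lookup table comes from.

    // Illustrative scalar FP16 -> FP32 conversion (assumption: the half is stored as raw IEEE binary16 bits).
    #include <stdint.h>
    #include <string.h>

    static float fp16_to_fp32_scalar(uint16_t h) {
        const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
        uint32_t exp  = (h >> 10) & 0x1F;
        uint32_t mant = h & 0x3FF;
        uint32_t bits;

        if (exp == 0) {
            if (mant == 0) {
                bits = sign;                                   // signed zero
            } else {
                int e = -1;                                    // normalize a subnormal half
                do { mant <<= 1; e++; } while ((mant & 0x400) == 0);
                bits = sign | ((127 - 15 - e) << 23) | ((mant & 0x3FF) << 13);
            }
        } else if (exp == 31) {
            bits = sign | 0x7F800000u | (mant << 13);          // inf / NaN
        } else {
            bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);
        }

        float f;
        memcpy(&f, &bits, sizeof(f));                          // reinterpret the float bit pattern
        return f;
    }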
@@ -261,6 +305,7 @@ static float table_f32_f16[1 << 16];
 
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
+// This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)
 
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
@@ -451,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
 assert(k % QK == 0);
 
-#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__)
+#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
 const int nb = k / QK;
 const size_t bs = sizeof(float) + QK/2;
 
@@ -461,7 +506,52 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
 uint8_t pp[QK/2];
 #endif
 
-#if __ARM_NEON
+#if defined(__POWER9_VECTOR__)
+#if QK == 32
+const vector float v85 = vec_splats(8.5f);
+for (int i = 0; i < nb; i++) {
+float amax = 0.0f; // absolute max
+
+vector float srcv [8];
+vector float asrcv[8];
+vector float amaxv[8];
+
+for (int l = 0; l < 8; l++) srcv[l] = *(vector float *)(x + i*32 + 4*l);
+for (int l = 0; l < 8; l++) asrcv[l] = vec_abs(srcv[l]);
+
+for (int l = 0; l < 4; l++) amaxv[2*l] = vec_max(asrcv[2*l], asrcv[2*l+1]);
+//for (int l = 0; l < 2; l++) amaxv[4*l] = vec_max(amaxv[4*l], amaxv[4*l+2]);
+amaxv[0] = vec_max(amaxv[0], amaxv[2]);
+amaxv[4] = vec_max(amaxv[4], amaxv[6]);
+//for (int l = 0; l < 1; l++) amaxv[8*l] = vec_max(amaxv[8*l], amaxv[8*l+4]);
+amaxv[0] = vec_max(amaxv[0], amaxv[4]);
+
+amax = MAX(
+MAX(vec_extract(amaxv[0], 0), vec_extract(amaxv[0], 1)),
+MAX(vec_extract(amaxv[0], 2), vec_extract(amaxv[0], 3)));
+
+const float d = amax / ((1 << 3) - 1);
+const float id = d ? 1.0/d : 0.0;
+
+*(float *)pd = d;
+pd += bs;
+
+const vector float vid = vec_splats(id);
+for (int l = 0; l < 8; l++) {
+const vector float vf = vec_madd(srcv[l], vid, v85);
+const vector signed int vi = vec_signed(vf);
+
+pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4);
+pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4);
+}
+
+//memcpy(pb, pp, sizeof(pp));
+pb += bs;
+}
+#else
+#error "not implemented for QK"
+#endif
+#elif __ARM_NEON
 #if QK == 32
 for (int i = 0; i < nb; i++) {
 float amax = 0.0f; // absolute max
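As a reference point for the vector code above, a scalar sketch of the same Q4_0 block encoding follows. It is illustrative only (written for this note, not taken from the patch), assumes QK == 32, and mirrors the logic of the SIMD path: one float scale d = amax/7 per block, then two 4-bit values per byte, biased by 8. Dequantization is then simply (nibble - 8) * d per element.

    // Illustrative scalar Q4_0 quantization of one 32-element block (assumption: QK == 32).
    #include <math.h>
    #include <stdint.h>

    static void quantize_block_q4_0_scalar(const float * x, float * d_out, uint8_t nib[16]) {
        float amax = 0.0f;                       // absolute max of the block
        for (int l = 0; l < 32; l++) {
            const float v = fabsf(x[l]);
            if (v > amax) amax = v;
        }

        const float d  = amax / ((1 << 3) - 1);  // scale so inputs map into [-7, 7]
        const float id = d ? 1.0f/d : 0.0f;
        *d_out = d;

        for (int l = 0; l < 32; l += 2) {
            const uint8_t v0 = (uint8_t)(x[l + 0]*id + 8.5f);  // biased nibble, roughly 1..15
            const uint8_t v1 = (uint8_t)(x[l + 1]*id + 8.5f);
            nib[l/2] = v0 | (v1 << 4);           // pack two 4-bit values per byte
        }
    }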
@@ -2344,6 +2434,7 @@ struct ggml_context {
 size_t mem_size;
 void * mem_buffer;
 bool mem_buffer_owned;
+bool mem_buffer_mlocked;
 
 int n_objects;
 
@@ -2619,16 +2710,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 }
 
 *ctx = (struct ggml_context) {
-/*.mem_size =*/ params.mem_size,
-/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
-/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
-/*.n_objects =*/ 0,
-/*.objects_begin =*/ NULL,
-/*.objects_end =*/ NULL,
-/*.scratch =*/ { 0, 0, NULL, },
-/*.scratch_save =*/ { 0, 0, NULL, },
+/*.mem_size =*/ params.mem_size,
+/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
+/*.mem_buffer_mlocked =*/ false,
+/*.n_objects =*/ 0,
+/*.objects_begin =*/ NULL,
+/*.objects_end =*/ NULL,
+/*.scratch =*/ { 0, 0, NULL, },
+/*.scratch_save =*/ { 0, 0, NULL, },
 };
 
+GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
+
 ggml_assert_aligned(ctx->mem_buffer);
 
 GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
@@ -2651,6 +2745,14 @@ void ggml_free(struct ggml_context * ctx) {
 GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
 __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
 
+#if GGML_MLOCK_SUPPORT
+if (ctx->mem_buffer_mlocked) {
+if (munlock(ctx->mem_buffer, ctx->mem_size)) {
+fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
+}
+}
+#endif
+
 if (ctx->mem_buffer_owned) {
 free(ctx->mem_buffer);
 }
@@ -2679,6 +2781,37 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
 return result;
 }
 
+bool ggml_mlock_supported(void) {
+return GGML_MLOCK_SUPPORT;
+}
+
+#if GGML_MLOCK_SUPPORT
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
+"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
+#else
+#define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
+#endif
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+if (ctx->mem_buffer_mlocked) {
+return true;
+}
+if (mlock(ctx->mem_buffer, ctx->mem_size)) {
+int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+ctx->mem_size, strerror(errno));
+GGML_ASSERT(ret >= 0);
+return false;
+}
+ctx->mem_buffer_mlocked = true;
+return true;
+}
+#else // GGML_MLOCK_SUPPORT
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+*err_p = strdup("can't mlock because it's not supported on this system");
+return false;
+}
+#endif // GGML_MLOCK_SUPPORT
+
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_tensor * ggml_new_tensor_impl(
@@ -5713,17 +5846,28 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 const struct ggml_tensor * src0,
 const struct ggml_tensor * src1,
 struct ggml_tensor * dst) {
-UNUSED(src0);
+const int ne00 = src0->ne[0];
+const int ne01 = src0->ne[1];
 
 const int ne10 = src1->ne[0];
 
 const int ne0 = dst->ne[0];
 const int ne1 = dst->ne[1];
 
+// TMP: disable BLAS for now there is definitely a bug
+return false;
+
 // TODO: find the optimal values for these
 if (ggml_is_contiguous(src0) &&
 ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
-//printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
+
+// disable BLAS for Q4_0 and Q4_1
+// there is a bug that has to be fixed before enabling
+if (src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1) {
+return false;
+}
+
+//printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);
 return true;
 }
3 ggml.h

@@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
+bool ggml_mlock_supported(void);
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
+
 struct ggml_tensor * ggml_new_tensor(
 struct ggml_context * ctx,
 enum ggml_type type,
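A minimal caller sketch for the two new declarations (written for this note, not part of the patch; it mirrors how llama.cpp uses them later in this diff and assumes a valid context returned by ggml_init):

    // Hypothetical helper around the new mlock API; the error string is heap-allocated by ggml_mlock.
    #include <stdio.h>
    #include <stdlib.h>
    #include "ggml.h"

    static void try_mlock(struct ggml_context * ctx) {
        if (!ggml_mlock_supported()) {
            return;                    // e.g. no <sys/mman.h> on this platform
        }
        char * err = NULL;
        if (!ggml_mlock(ctx, &err)) {  // pins the context buffer so it cannot be swapped out
            fprintf(stderr, "%s\n", err);
            free(err);
        }
    }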
400 llama.cpp

@@ -5,12 +5,25 @@
 #include <cinttypes>
 #include <fstream>
 #include <random>
+#include <map>
 #include <unordered_map>
 #include <queue>
 #include <regex>
 #include <cassert>
 #include <cstring>
 
+#define LLAMA_USE_SCRATCH
+#define LLAMA_MAX_SCRATCH_BUFFERS 16
+
+#define LLAMA_ASSERT(x) \
+do { \
+if (!(x)) { \
+fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+abort(); \
+} \
+} while (0)
+
+
 // determine number of model parts based on the dimension
 static const std::unordered_map<int, int> LLAMA_N_PARTS = {
 { 4096, 1 },
@@ -19,6 +32,52 @@ static const std::unordered_map<int, int> LLAMA_N_PARTS = {
 { 8192, 8 },
 };
 
+// available llama models
+enum e_model {
+MODEL_UNKNOWN,
+MODEL_7B,
+MODEL_13B,
+MODEL_30B,
+MODEL_65B,
+};
+
+static const size_t MB = 1024*1024;
+
+// computed for n_ctx == 2048
+// TODO: dynamically determine these sizes
+// needs modifications in ggml
+
+static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
+{ MODEL_7B, 512ull*MB },
+{ MODEL_13B, 512ull*MB },
+{ MODEL_30B, 512ull*MB },
+{ MODEL_65B, 512ull*MB },
+};
+
+static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
+{ MODEL_7B, 512ull*MB },
+{ MODEL_13B, 512ull*MB },
+{ MODEL_30B, 512ull*MB },
+{ MODEL_65B, 512ull*MB },
+};
+
+// 2*n_embd*n_ctx*n_layer*sizeof(float16)
+static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
+{ MODEL_7B, 1026ull*MB },
+{ MODEL_13B, 1608ull*MB },
+{ MODEL_30B, 3124ull*MB },
+{ MODEL_65B, 5120ull*MB },
+};
+
+// this is mostly needed for temporary mul_mat buffers to dequantize the data
+// not actually needed if BLAS is disabled
+static const std::map<e_model, size_t> MEM_REQ_EVAL = {
+{ MODEL_7B, 768ull*MB },
+{ MODEL_13B, 1024ull*MB },
+{ MODEL_30B, 1280ull*MB },
+{ MODEL_65B, 1536ull*MB },
+};
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
 int32_t n_vocab = 32000;
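As a sanity check on the MEM_REQ_KV_SELF entry for the 7B model (my arithmetic, assuming the standard 7B shape n_embd = 4096 and n_layer = 32, the n_ctx = 2048 noted above, and a 2-byte f16 cache):

    2 tensors (k and v) * 4096 * 32 * 2048 elements * 2 bytes = 1,073,741,824 bytes = 1024 MB

kv_cache_init() further down resizes its buffer to exactly this plus 2 MB of ggml object overhead, which matches the 1026 MB figure in the table.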
@@ -50,7 +109,20 @@ struct llama_layer {
 struct ggml_tensor * w3;
 };
 
+struct llama_kv_cache {
+struct ggml_tensor * k;
+struct ggml_tensor * v;
+
+struct ggml_context * ctx;
+
+std::vector<uint8_t> buf;
+
+int n; // number of tokens currently in the cache
+};
+
 struct llama_model {
+e_model type = MODEL_UNKNOWN;
+
 llama_hparams hparams;
 
 struct ggml_tensor * tok_embeddings;
@@ -60,12 +132,18 @@ struct llama_model {
 
 std::vector<llama_layer> layers;
 
-// key + value memory
-struct ggml_tensor * memory_k;
-struct ggml_tensor * memory_v;
-
-//
+// context
 struct ggml_context * ctx;
 
+// key + value cache for the self attention
+// TODO: move to llama_state
+struct llama_kv_cache kv_self;
+
+// the model memory buffer
+std::vector<uint8_t> buf;
+
+// tensors
+int n_loaded;
 std::unordered_map<std::string, struct ggml_tensor *> tensors;
 };
@@ -102,8 +180,91 @@ struct llama_context {
 // decode output (2-dimensional array: [n_tokens][n_vocab])
 std::vector<float> logits;
 bool logits_all = false;
 
+// input embedding (1-dimensional array: [n_embd])
+std::vector<float> embedding;
+
+// memory buffers used to evaluate the model
+// TODO: move in llama_state
+std::vector<uint8_t> buf_compute;
+std::vector<uint8_t> buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+
+int buf_last = 0;
+size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
+
+void use_buf(struct ggml_context * ctx, int i) {
+#if defined(LLAMA_USE_SCRATCH)
+size_t last_size = 0;
+
+if (i == -1) {
+last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
+} else {
+auto & buf = buf_scratch[i];
+last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
+}
+
+if (buf_last >= 0) {
+buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
+}
+
+buf_last = i;
+#else
+(void) i;
+(void) ctx;
+#endif
+}
+
+size_t get_buf_max_mem(int i) const {
+#if defined(LLAMA_USE_SCRATCH)
+return buf_max_size[i];
+#else
+(void) i;
+return 0;
+#endif
+}
 };
 
+//
+// kv cache
+//
+
+static bool kv_cache_init(
+const struct llama_hparams & hparams,
+struct llama_kv_cache & cache,
+ggml_type wtype,
+int n_ctx) {
+const int n_embd = hparams.n_embd;
+const int n_layer = hparams.n_layer;
+
+const int n_mem = n_layer*n_ctx;
+const int n_elements = n_embd*n_mem;
+
+cache.buf.resize(2*n_elements*ggml_type_size(wtype) + 2u*MB);
+
+struct ggml_init_params params;
+params.mem_size = cache.buf.size();
+params.mem_buffer = cache.buf.data();
+
+cache.ctx = ggml_init(params);
+
+if (!cache.ctx) {
+fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+return false;
+}
+
+cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+
+return true;
+}
+
+static void kv_cache_free(struct llama_kv_cache & cache) {
+if (cache.ctx) {
+ggml_free(cache.ctx);
+cache.ctx = nullptr;
+}
+}
+
 struct llama_context_params llama_context_default_params() {
 struct llama_context_params result = {
 /*.n_ctx =*/ 512,
@@ -112,6 +273,8 @@ struct llama_context_params llama_context_default_params() {
 /*.f16_kv =*/ false,
 /*.logits_all =*/ false,
 /*.vocab_only =*/ false,
+/*.use_mlock =*/ false,
+/*.embedding =*/ false,
 };
 
 return result;
@@ -203,6 +366,22 @@ static bool llama_model_load(
 fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
 }
 
+if (hparams.n_layer == 32) {
+model.type = e_model::MODEL_7B;
+}
+
+if (hparams.n_layer == 40) {
+model.type = e_model::MODEL_13B;
+}
+
+if (hparams.n_layer == 60) {
+model.type = e_model::MODEL_30B;
+}
+
+if (hparams.n_layer == 80) {
+model.type = e_model::MODEL_65B;
+}
+
 fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
 fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
@@ -213,6 +392,7 @@ static bool llama_model_load(
 fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
 fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
 fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
+fprintf(stderr, "%s: type = %d\n", __func__, model.type);
 }
 
 // load vocab
@@ -309,11 +489,32 @@ static bool llama_model_load(
 fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
 }
 
+// print memory requirements
+{
+const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+// this is the total memory required to run the inference
+const size_t mem_required =
+ctx_size +
+MEM_REQ_SCRATCH0.at(model.type) +
+MEM_REQ_SCRATCH1.at(model.type) +
+MEM_REQ_EVAL.at (model.type);
+
+// this is the memory required by one llama_state
+const size_t mem_required_state =
+scale*MEM_REQ_KV_SELF.at(model.type);
+
+fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+}
+
 // create the ggml context
 {
+lctx.model.buf.resize(ctx_size);
+
 struct ggml_init_params params = {
-/*.mem_size =*/ ctx_size,
-/*.mem_buffer =*/ NULL,
+/*.mem_size =*/ lctx.model.buf.size(),
+/*.mem_buffer =*/ lctx.model.buf.data(),
 };
 
 model.ctx = ggml_init(params);
@@ -376,25 +577,6 @@ static bool llama_model_load(
 }
 }
 
-// key + value memory
-{
-const auto & hparams = model.hparams;
-
-const int n_embd = hparams.n_embd;
-const int n_layer = hparams.n_layer;
-const int n_ctx = hparams.n_ctx;
-
-const int n_mem = n_layer*n_ctx;
-const int n_elements = n_embd*n_mem;
-
-model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
-model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
-
-const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-}
-
 const size_t file_offset = fin.tellg();
 
 fin.close();
@@ -418,9 +600,10 @@ static bool llama_model_load(
 
 // load weights
 {
-int n_tensors = 0;
 size_t total_size = 0;
 
+model.n_loaded = 0;
+
 fprintf(stderr, "%s: ", __func__);
 
 while (true) {
@@ -585,7 +768,10 @@ static bool llama_model_load(
 }
 
 //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-if (++n_tensors % 8 == 0) {
+model.n_loaded++;
+
+// progress
+if (model.n_loaded % 8 == 0) {
 fprintf(stderr, ".");
 fflush(stderr);
 }
@@ -593,14 +779,18 @@ static bool llama_model_load(
 
 fprintf(stderr, " done\n");
 
-fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
+if (model.n_loaded == 0) {
+fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+} else if (model.n_loaded != (int) model.tensors.size()) {
+fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+return false;
+}
 }
 
 fin.close();
 }
 
-lctx.logits.reserve(lctx.model.hparams.n_ctx);
-
 lctx.t_load_us = ggml_time_us() - t_start_us;
 
 return true;
@@ -626,6 +816,10 @@ static bool llama_eval_internal(
 const auto & model = lctx.model;
 const auto & hparams = model.hparams;
 
+auto & kv_self = model.kv_self;
+
+LLAMA_ASSERT(!!kv_self.ctx);
+
 const int n_embd = hparams.n_embd;
 const int n_layer = hparams.n_layer;
 const int n_ctx = hparams.n_ctx;
@@ -634,27 +828,11 @@ static bool llama_eval_internal(
 const int n_rot = hparams.n_embd/hparams.n_head;
 
 auto & mem_per_token = lctx.mem_per_token;
-
-// TODO: fix this hardcoded size
-static size_t buf_size = 512u*1024*1024;
-static void * buf = malloc(buf_size);
-
-if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead
-//fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-// reallocate
-buf_size = buf_size_new;
-buf = realloc(buf, buf_size);
-if (buf == nullptr) {
-fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-return false;
-}
-}
+auto & buf_compute = lctx.buf_compute;
 
 struct ggml_init_params params = {
-/*.mem_size =*/ buf_size,
-/*.mem_buffer =*/ buf,
+/*.mem_size =*/ buf_compute.size(),
+/*.mem_buffer =*/ buf_compute.data(),
 };
 
 struct ggml_context * ctx0 = ggml_init(params);
@@ -671,6 +849,8 @@ static bool llama_eval_internal(
 
 struct ggml_tensor * cur;
 
+lctx.use_buf(ctx0, 0);
+
 // norm
 {
 cur = ggml_rms_norm(ctx0, inpL);
@@ -689,8 +869,8 @@ static bool llama_eval_internal(
 
 // store key and value to memory
 if (N >= 1) {
-struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
+struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past));
 
 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
@@ -711,7 +891,7 @@ static bool llama_eval_internal(
 ggml_permute(ctx0,
 ggml_rope(ctx0,
 ggml_reshape_3d(ctx0,
-ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
 n_embd/n_head, n_head, n_past + N),
 n_past, n_rot, 1),
 0, 2, 1, 3);
@@ -737,7 +917,7 @@ static bool llama_eval_internal(
 ggml_cpy(ctx0,
 ggml_permute(ctx0,
 ggml_reshape_3d(ctx0,
-ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.v)*n_embd),
 n_embd/n_head, n_head, n_past + N),
 1, 2, 0, 3),
 ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
@@ -759,6 +939,8 @@ static bool llama_eval_internal(
 cur);
 }
 
+lctx.use_buf(ctx0, 1);
+
 struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
 
 // feed-forward network
@@ -777,7 +959,6 @@ static bool llama_eval_internal(
 model.layers[il].w3,
 cur);
 
-
 cur = ggml_mul_mat(ctx0,
 model.layers[il].w1,
 cur);
@@ -792,26 +973,34 @@ static bool llama_eval_internal(
 cur);
 }
 
-cur = ggml_add(ctx0, cur, inpFF);
+cur = ggml_add(ctx0, cur, inpFF);
 
 // input for next layer
 inpL = cur;
 }
 
+lctx.use_buf(ctx0, 0);
+
+// used at the end to optionally extract the embeddings
+struct ggml_tensor * embeddings = NULL;
+
 // norm
 {
+
 inpL = ggml_rms_norm(ctx0, inpL);
 
 // inpL = norm*inpL
 inpL = ggml_mul(ctx0,
 ggml_repeat(ctx0, model.norm, inpL),
 inpL);
+
+embeddings = inpL;
 }
 
 // lm_head
-{
-inpL = ggml_mul_mat(ctx0, model.output, inpL);
-}
+inpL = ggml_mul_mat(ctx0, model.output, inpL);
+
+lctx.use_buf(ctx0, -1);
 
 // logits -> probs
 //inpL = ggml_soft_max(ctx0, inpL);
@@ -828,21 +1017,38 @@ static bool llama_eval_internal(
 //embd_w.resize(n_vocab*N);
 //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
 
-auto & logits_out = lctx.logits;
-
-if (lctx.logits_all) {
-logits_out.resize(n_vocab * N);
-memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-} else {
-// return result for just the last token
-logits_out.resize(n_vocab);
-memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+// extract logits
+{
+auto & logits_out = lctx.logits;
+
+if (lctx.logits_all) {
+logits_out.resize(n_vocab * N);
+memcpy(logits_out.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+} else {
+// return result for just the last token
+logits_out.resize(n_vocab);
+memcpy(logits_out.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+}
+}
+
+// extract embeddings
+if (lctx.embedding.size()) {
+auto & embedding_out = lctx.embedding;
+
+embedding_out.resize(n_embd);
+memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
 }
 
 if (mem_per_token == 0) {
 mem_per_token = ggml_used_mem(ctx0)/N;
 }
 //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));
 
+#if 0
+printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
+ggml_used_mem(ctx0)/1024.0/1024.0,
+lctx.get_buf_max_mem(0)/1024.0/1024.0,
+lctx.get_buf_max_mem(1)/1024.0/1024.0);
+#endif
+
 ggml_free(ctx0);
@@ -1415,19 +1621,64 @@ struct llama_context * llama_init_from_file(
 ctx->rng = std::mt19937(params.seed);
 ctx->logits_all = params.logits_all;
 
-ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) {
+if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, memory_type,
+params.vocab_only)) {
 fprintf(stderr, "%s: failed to load model\n", __func__);
-delete ctx;
+llama_free(ctx);
 return nullptr;
 }
 
+if (params.use_mlock) {
+char *err;
+if (!ggml_mlock(ctx->model.ctx, &err)) {
+fprintf(stderr, "%s\n", err);
+free(err);
+llama_free(ctx);
+return nullptr;
+}
+}
+
+// reserve memory for context buffers
+{
+if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+llama_free(ctx);
+return nullptr;
+}
+
+{
+const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v);
+fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+}
+
+const auto & hparams = ctx->model.hparams;
+if (params.logits_all) {
+ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
+} else {
+ctx->logits.reserve(hparams.n_ctx);
+}
+
+if (params.embedding){
+ctx->embedding.reserve(hparams.n_embd);
+}
+
+ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
+
+ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
+ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+}
+
 return ctx;
 }
 
 void llama_free(struct llama_context * ctx) {
-ggml_free(ctx->model.ctx);
+kv_cache_free(ctx->model.kv_self);
+
+if (ctx->model.ctx) {
+ggml_free(ctx->model.ctx);
+}
 
 delete ctx;
 }
@@ -1491,6 +1742,10 @@ float * llama_get_logits(struct llama_context * ctx) {
 return ctx->logits.data();
 }
 
+float * llama_get_embeddings(struct llama_context * ctx) {
+return ctx->embedding.data();
+}
+
 const char * llama_token_to_str(struct llama_context * ctx, llama_token token) {
 if (token >= llama_n_vocab(ctx)) {
 return nullptr;
@@ -1576,4 +1831,3 @@ const char * llama_print_system_info(void) {
 
 return s.c_str();
 }
-
6 llama.h

@@ -53,6 +53,8 @@ extern "C" {
 bool f16_kv; // use fp16 for KV cache
 bool logits_all; // the llama_eval() call computes all logits, not just the last one
 bool vocab_only; // only load the vocabulary, no weights
+bool use_mlock; // force system to keep model in RAM
+bool embedding; // embedding mode only
 };
 
 LLAMA_API struct llama_context_params llama_context_default_params();
@@ -108,6 +110,10 @@ extern "C" {
 // Cols: n_vocab
 LLAMA_API float * llama_get_logits(struct llama_context * ctx);
 
+// Get the embeddings for the input
+// shape: [n_embd] (1-dimensional)
+LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
 // Token Id -> String. Uses the vocabulary in the provided context
 LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
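A short usage sketch for the two additions (mine, not part of the patch; it follows the same calls main.cpp makes below, with a hypothetical std::vector<llama_token> tokens, thread count and model path as placeholders):

    // Hypothetical embedding-only use of the API declared above.
    llama_context_params lparams = llama_context_default_params();
    lparams.embedding = true;                      // reserve room for the [n_embd] output vector
    llama_context * ctx = llama_init_from_file("./models/7B/ggml-model-q4_0.bin", lparams);

    llama_eval(ctx, tokens.data(), tokens.size(), /*n_past=*/0, /*n_threads=*/4);
    const float * emb = llama_get_embeddings(ctx); // n_embd floats for the last evaluated token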
BIN llamacpp.dll (binary file not shown)
71 main.cpp

@@ -199,6 +199,8 @@ int main(int argc, char ** argv) {
 lparams.seed = params.seed;
 lparams.f16_kv = params.memory_f16;
 lparams.logits_all = params.perplexity;
+lparams.use_mlock = params.use_mlock;
+lparams.embedding = params.embedding;
 
 ctx = llama_init_from_file(params.model.c_str(), lparams);
 
@@ -215,11 +217,23 @@ int main(int argc, char ** argv) {
 params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
 }
 
-// determine the required inference memory per token:
-// TODO: better way to do that
-{
-const std::vector<llama_token> tmp = { 0, 1, 2, 3 };
-llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+// determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters
+// uncomment the "used_mem" line in llama.cpp to see the results
+if (params.mem_test) {
+{
+const std::vector<llama_token> tmp(params.n_batch, 0);
+llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+}
+
+{
+const std::vector<llama_token> tmp = { 0, };
+llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads);
+}
+
+llama_print_timings(ctx);
+llama_free(ctx);
+
+return 0;
 }
 
 if (params.perplexity) {
@@ -292,6 +306,7 @@ int main(int argc, char ** argv) {
 
 std::vector<llama_token> embd;
 
+
 int last_n_size = params.repeat_last_n;
 std::vector<llama_token> last_n_tokens(last_n_size);
 std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
@@ -324,6 +339,27 @@ int main(int argc, char ** argv) {
 // the first thing we will do is to output the prompt, so set color accordingly
 set_console_state(CONSOLE_STATE_PROMPT);
 
+if (params.embedding){
+embd = embd_inp;
+
+if (embd.size() > 0) {
+if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
+fprintf(stderr, "%s : failed to eval\n", __func__);
+return 1;
+}
+}
+
+const auto embeddings = llama_get_embeddings(ctx);
+
+// TODO: print / use the embeddings
+
+if (params.use_color) {
+printf(ANSI_COLOR_RESET);
+}
+
+return 0;
+}
+
 while (remaining_tokens > 0 || params.interactive) {
 // predict
 if (embd.size() > 0) {
@@ -336,7 +372,7 @@ int main(int argc, char ** argv) {
 n_past += embd.size();
 embd.clear();
 
-if ((int) embd_inp.size() <= input_consumed) {
+if ((int) embd_inp.size() <= input_consumed && !is_interacting) {
 // out of user input, sample next token
 const float top_k = params.top_k;
 const float top_p = params.top_p;
@@ -363,7 +399,7 @@ int main(int argc, char ** argv) {
 }
 
 // replace end of text token with newline token when in interactive mode
-if (id == llama_token_eos() && params.interactive) {
+if (id == llama_token_eos() && params.interactive && !params.instruct) {
 id = llama_token_newline.front();
 if (params.antiprompt.size() != 0) {
 // tokenize and inject first reverse prompt
@@ -415,13 +451,16 @@ int main(int argc, char ** argv) {
 }
 
 // Check if each of the reverse prompts appears at the end of the output.
-for (std::string antiprompt : params.antiprompt) {
+for (std::string & antiprompt : params.antiprompt) {
 if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
 is_interacting = true;
+set_console_state(CONSOLE_STATE_USER_INPUT);
+fflush(stdout);
 break;
 }
 }
-if (is_interacting) {
 
+if (n_past > 0 && is_interacting) {
 // potentially set color to indicate we are taking user input
 set_console_state(CONSOLE_STATE_USER_INPUT);
 
@@ -459,13 +498,20 @@ int main(int argc, char ** argv) {
 
 input_noecho = true; // do not echo this again
 }
-is_interacting = false;
+
+if (n_past > 0) {
+is_interacting = false;
+}
 }
 
 // end of text token
 if (embd.back() == llama_token_eos()) {
-fprintf(stderr, " [end of text]\n");
-break;
+if (params.instruct) {
+is_interacting = true;
+} else {
+fprintf(stderr, " [end of text]\n");
+break;
+}
 }
 
 // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
@@ -480,7 +526,6 @@ int main(int argc, char ** argv) {
 #endif
 
 llama_print_timings(ctx);
-
 llama_free(ctx);
 
 set_console_state(CONSOLE_STATE_DEFAULT);
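Together with the utils.cpp changes below, the new switches can be exercised directly from the command line; the invocations here are illustrative only (flag combinations are mine, model and prompt paths as in chat.sh):

    # chat with the new batch size, an f32 KV cache and the model buffer pinned via mlock
    ./main -m ./models/7B/ggml-model-q4_0.bin -b 128 --memory_f32 --mlock --color -i -r "User:" -f prompts/chat-with-bob.txt
    # measure peak memory usage for the chosen batch size, then exit
    ./main -m ./models/7B/ggml-model-q4_0.bin -b 512 --mtest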
BIN main.exe (binary file not shown)
BIN quantize.exe (binary file not shown)
21 utils.cpp

@@ -1,3 +1,5 @@
+#include "ggml.h"
+
 #include "utils.h"
 
 #include <cassert>
@@ -77,8 +79,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.n_ctx = std::stoi(argv[i]);
-} else if (arg == "--memory_f16") {
-params.memory_f16 = true;
+} else if (arg == "--memory_f32") {
+params.memory_f16 = false;
 } else if (arg == "--top_p") {
 if (++i >= argc) {
 invalid_param = true;
@@ -109,6 +111,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.n_batch = std::stoi(argv[i]);
+params.n_batch = std::min(512, params.n_batch);
 } else if (arg == "-m" || arg == "--model") {
 if (++i >= argc) {
 invalid_param = true;
@@ -117,12 +120,20 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 params.model = argv[i];
 } else if (arg == "-i" || arg == "--interactive") {
 params.interactive = true;
+} else if (arg == "--embedding") {
+params.embedding = true;
+} else if (arg == "--interactive-start") {
+params.interactive = true;
 } else if (arg == "--interactive-first") {
 params.interactive_start = true;
 } else if (arg == "-ins" || arg == "--instruct") {
 params.instruct = true;
 } else if (arg == "--color") {
 params.use_color = true;
+} else if (arg == "--mlock") {
+params.use_mlock = true;
+} else if (arg == "--mtest") {
+params.mem_test = true;
 } else if (arg == "-r" || arg == "--reverse-prompt") {
 if (++i >= argc) {
 invalid_param = true;
@@ -185,11 +196,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
 fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
 fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
-fprintf(stderr, " --memory_f16 use f16 instead of f32 for memory key+value\n");
+fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n");
 fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
 fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
 fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
 fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
+if (ggml_mlock_supported()) {
+fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
+}
+fprintf(stderr, " --mtest compute maximum memory usage\n");
 fprintf(stderr, " -m FNAME, --model FNAME\n");
 fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
 fprintf(stderr, "\n");
21 utils.h

@@ -14,12 +14,13 @@
 //
 
 struct gpt_params {
-int32_t seed = -1; // RNG seed
+int32_t seed = -1; // RNG seed
 int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
-int32_t n_predict = 128; // new tokens to predict
-int32_t repeat_last_n = 64; // last n tokens to penalize
-int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
-int32_t n_ctx = 512; //context size
+int32_t n_predict = 128; // new tokens to predict
+int32_t repeat_last_n = 64; // last n tokens to penalize
+int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
+int32_t n_ctx = 512; // context size
+int32_t n_batch = 8; // batch size for prompt processing
 
 // sampling parameters
 int32_t top_k = 40;
@@ -27,21 +28,25 @@ struct gpt_params {
 float temp = 0.80f;
 float repeat_penalty = 1.10f;
 
-int32_t n_batch = 8; // batch size for prompt processing
-
 std::string model = "models/lamma-7B/ggml-model.bin"; // model path
 std::string prompt = "";
 
 std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 
-bool memory_f16 = false; // use f16 instead of f32 for memory kv
+bool memory_f16 = true; // use f16 instead of f32 for memory kv
 bool random_prompt = false; // do not randomize prompt if none provided
 bool use_color = false; // use color to distinguish generations and inputs
 bool interactive = false; // interactive mode
 
+bool embedding = false; // get only sentence embedding
 bool interactive_start = false; // wait for user input immediately
 
 bool instruct = false; // instruction mode (used for Alpaca models)
 bool ignore_eos = false; // do not stop generating after eos
 bool perplexity = false; // compute perplexity over the prompt
+bool use_mlock = false; // use mlock to keep model in memory
+bool mem_test = false; // compute maximum memory usage
 };
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);