Merge branch 'master' into concedo
# Conflicts: # .devops/full.Dockerfile # README.md
This commit is contained in:
commit
06c711d770
8 changed files with 433 additions and 374 deletions
52
ggml.h
52
ggml.h
|
@ -258,11 +258,11 @@ struct ggml_tensor {
|
||||||
enum ggml_type type;
|
enum ggml_type type;
|
||||||
|
|
||||||
int n_dims;
|
int n_dims;
|
||||||
int ne[GGML_MAX_DIMS]; // number of elements
|
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||||
// nb[0] = sizeof(type)
|
// nb[0] = sizeof(type)
|
||||||
// nb[1] = nb[0] * ne[0] + padding
|
// nb[1] = nb[0] * ne[0] + padding
|
||||||
// nb[i] = nb[i-1] * ne[i-1]
|
// nb[i] = nb[i-1] * ne[i-1]
|
||||||
|
|
||||||
// compute data
|
// compute data
|
||||||
enum ggml_op op;
|
enum ggml_op op;
|
||||||
|
@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
|
||||||
void ggml_print_object (const struct ggml_object * obj);
|
void ggml_print_object (const struct ggml_object * obj);
|
||||||
void ggml_print_objects(const struct ggml_context * ctx);
|
void ggml_print_objects(const struct ggml_context * ctx);
|
||||||
|
|
||||||
int ggml_nelements(const struct ggml_tensor * tensor);
|
int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
||||||
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
int ggml_blck_size (enum ggml_type type);
|
int ggml_blck_size (enum ggml_type type);
|
||||||
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||||
|
@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int n_dims,
|
int n_dims,
|
||||||
const int *ne);
|
const int64_t *ne);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_new_tensor_1d(
|
struct ggml_tensor * ggml_new_tensor_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int ne0);
|
int64_t ne0);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_new_tensor_2d(
|
struct ggml_tensor * ggml_new_tensor_2d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int ne0,
|
int64_t ne0,
|
||||||
int ne1);
|
int64_t ne1);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_new_tensor_3d(
|
struct ggml_tensor * ggml_new_tensor_3d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int ne0,
|
int64_t ne0,
|
||||||
int ne1,
|
int64_t ne1,
|
||||||
int ne2);
|
int64_t ne2);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_new_tensor_4d(
|
struct ggml_tensor * ggml_new_tensor_4d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int ne0,
|
int64_t ne0,
|
||||||
int ne1,
|
int64_t ne1,
|
||||||
int ne2,
|
int64_t ne2,
|
||||||
int ne3);
|
int64_t ne3);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||||
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||||
|
@ -531,30 +531,30 @@ struct ggml_tensor * ggml_reshape(
|
||||||
struct ggml_tensor * ggml_reshape_2d(
|
struct ggml_tensor * ggml_reshape_2d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int ne0,
|
int64_t ne0,
|
||||||
int ne1);
|
int64_t ne1);
|
||||||
|
|
||||||
// return view(a)
|
// return view(a)
|
||||||
// TODO: when we start computing gradient, make a copy instead of view
|
// TODO: when we start computing gradient, make a copy instead of view
|
||||||
struct ggml_tensor * ggml_reshape_3d(
|
struct ggml_tensor * ggml_reshape_3d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int ne0,
|
int64_t ne0,
|
||||||
int ne1,
|
int64_t ne1,
|
||||||
int ne2);
|
int64_t ne2);
|
||||||
|
|
||||||
// offset in bytes
|
// offset in bytes
|
||||||
struct ggml_tensor * ggml_view_1d(
|
struct ggml_tensor * ggml_view_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int ne0,
|
int64_t ne0,
|
||||||
size_t offset);
|
size_t offset);
|
||||||
|
|
||||||
struct ggml_tensor * ggml_view_2d(
|
struct ggml_tensor * ggml_view_2d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int ne0,
|
int64_t ne0,
|
||||||
int ne1,
|
int64_t ne1,
|
||||||
size_t nb1, // row stride in bytes
|
size_t nb1, // row stride in bytes
|
||||||
size_t offset);
|
size_t offset);
|
||||||
|
|
||||||
|
|
BIN
koboldcpp.dll
BIN
koboldcpp.dll
Binary file not shown.
Binary file not shown.
47
llama.cpp
47
llama.cpp
|
@ -256,8 +256,8 @@ static bool kv_cache_init(
|
||||||
const int n_embd = hparams.n_embd;
|
const int n_embd = hparams.n_embd;
|
||||||
const int n_layer = hparams.n_layer;
|
const int n_layer = hparams.n_layer;
|
||||||
|
|
||||||
const int n_mem = n_layer*n_ctx;
|
const int64_t n_mem = (int64_t)n_layer*n_ctx;
|
||||||
const int n_elements = n_embd*n_mem;
|
const int64_t n_elements = n_embd*n_mem;
|
||||||
|
|
||||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
||||||
|
|
||||||
|
@ -679,7 +679,7 @@ static bool llama_model_load(
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
|
||||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1194,6 +1194,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
||||||
const auto & logits = lctx.logits;
|
const auto & logits = lctx.logits;
|
||||||
const auto * plogits = logits.data() + logits.size() - n_logits;
|
const auto * plogits = logits.data() + logits.size() - n_logits;
|
||||||
|
|
||||||
|
if (temp <= 0) {
|
||||||
|
// select the token with the highest logit directly
|
||||||
|
float max_logit = plogits[0];
|
||||||
|
llama_vocab::id max_id = 0;
|
||||||
|
|
||||||
|
for (int i = 1; i < n_logits; ++i) {
|
||||||
|
if (plogits[i] > max_logit) {
|
||||||
|
max_logit = plogits[i];
|
||||||
|
max_id = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return max_id;
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<std::pair<float, llama_vocab::id>> logits_id;
|
std::vector<std::pair<float, llama_vocab::id>> logits_id;
|
||||||
logits_id.reserve(n_logits);
|
logits_id.reserve(n_logits);
|
||||||
|
|
||||||
|
@ -1668,6 +1682,33 @@ int llama_model_quantize(
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the KV cache that will contain the context for the
|
||||||
|
// ongoing prediction with the model.
|
||||||
|
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
||||||
|
return ctx->model.kv_self.buf.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Returns the size of the KV cache
|
||||||
|
size_t llama_get_kv_cache_size(struct llama_context * ctx) {
|
||||||
|
return ctx->model.kv_self.buf.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
||||||
|
return ctx->model.kv_self.n;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sets the KV cache containing the current context for the model
|
||||||
|
void llama_set_kv_cache(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
const uint8_t * kv_cache,
|
||||||
|
size_t n_size,
|
||||||
|
int n_token_count) {
|
||||||
|
// Make sure we have the same kv cache setup
|
||||||
|
LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
|
||||||
|
memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
|
||||||
|
ctx->model.kv_self.n = n_token_count;
|
||||||
|
}
|
||||||
|
|
||||||
int llama_eval(
|
int llama_eval(
|
||||||
struct llama_context * ctx,
|
struct llama_context * ctx,
|
||||||
const llama_token * tokens,
|
const llama_token * tokens,
|
||||||
|
|
17
llama.h
17
llama.h
|
@ -83,6 +83,23 @@ extern "C" {
|
||||||
const char * fname_out,
|
const char * fname_out,
|
||||||
int itype);
|
int itype);
|
||||||
|
|
||||||
|
// Returns the KV cache that will contain the context for the
|
||||||
|
// ongoing prediction with the model.
|
||||||
|
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
|
||||||
|
|
||||||
|
// Returns the size of the KV cache
|
||||||
|
LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
|
||||||
|
|
||||||
|
// Returns the number of tokens in the KV cache
|
||||||
|
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
|
||||||
|
|
||||||
|
// Sets the KV cache containing the current context for the model
|
||||||
|
LLAMA_API void llama_set_kv_cache(
|
||||||
|
struct llama_context * ctx,
|
||||||
|
const uint8_t * kv_cache,
|
||||||
|
size_t n_size,
|
||||||
|
int n_token_count);
|
||||||
|
|
||||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||||
// tokens + n_tokens is the provided batch of new tokens to process
|
// tokens + n_tokens is the provided batch of new tokens to process
|
||||||
// n_past is the number of tokens to use from previous eval calls
|
// n_past is the number of tokens to use from previous eval calls
|
||||||
|
|
BIN
main.exe
BIN
main.exe
Binary file not shown.
BIN
quantize.exe
BIN
quantize.exe
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue