Merge branch 'master' into concedo
# Conflicts: # .devops/full.Dockerfile # README.md
This commit is contained in:
commit
06c711d770
8 changed files with 433 additions and 374 deletions
52
ggml.h
52
ggml.h
|
@ -258,11 +258,11 @@ struct ggml_tensor {
|
|||
enum ggml_type type;
|
||||
|
||||
int n_dims;
|
||||
int ne[GGML_MAX_DIMS]; // number of elements
|
||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||
// nb[0] = sizeof(type)
|
||||
// nb[1] = nb[0] * ne[0] + padding
|
||||
// nb[i] = nb[i-1] * ne[i-1]
|
||||
int64_t ne[GGML_MAX_DIMS]; // number of elements
|
||||
size_t nb[GGML_MAX_DIMS]; // stride in bytes:
|
||||
// nb[0] = sizeof(type)
|
||||
// nb[1] = nb[0] * ne[0] + padding
|
||||
// nb[i] = nb[i-1] * ne[i-1]
|
||||
|
||||
// compute data
|
||||
enum ggml_op op;
|
||||
|
@ -328,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
|
|||
void ggml_print_object (const struct ggml_object * obj);
|
||||
void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
int ggml_nelements(const struct ggml_tensor * tensor);
|
||||
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
||||
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
|
||||
int ggml_blck_size (enum ggml_type type);
|
||||
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
|
@ -355,33 +355,33 @@ struct ggml_tensor * ggml_new_tensor(
|
|||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int n_dims,
|
||||
const int *ne);
|
||||
const int64_t *ne);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_1d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0);
|
||||
int64_t ne0);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_2d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0,
|
||||
int ne1);
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_3d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2);
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_4d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2,
|
||||
int ne3);
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
|
@ -531,30 +531,30 @@ struct ggml_tensor * ggml_reshape(
|
|||
struct ggml_tensor * ggml_reshape_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1);
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
// return view(a)
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
struct ggml_tensor * ggml_reshape_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int ne2);
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
// offset in bytes
|
||||
struct ggml_tensor * ggml_view_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int64_t ne0,
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_view_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int ne0,
|
||||
int ne1,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
size_t nb1, // row stride in bytes
|
||||
size_t offset);
|
||||
|
||||
|
|
BIN
koboldcpp.dll
BIN
koboldcpp.dll
Binary file not shown.
Binary file not shown.
47
llama.cpp
47
llama.cpp
|
@ -256,8 +256,8 @@ static bool kv_cache_init(
|
|||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
|
||||
const int64_t n_mem = (int64_t)n_layer*n_ctx;
|
||||
const int64_t n_elements = n_embd*n_mem;
|
||||
|
||||
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
|
||||
|
||||
|
@ -679,7 +679,7 @@ static bool llama_model_load(
|
|||
return false;
|
||||
}
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return false;
|
||||
}
|
||||
|
@ -1194,6 +1194,20 @@ static llama_vocab::id llama_sample_top_p_top_k(
|
|||
const auto & logits = lctx.logits;
|
||||
const auto * plogits = logits.data() + logits.size() - n_logits;
|
||||
|
||||
if (temp <= 0) {
|
||||
// select the token with the highest logit directly
|
||||
float max_logit = plogits[0];
|
||||
llama_vocab::id max_id = 0;
|
||||
|
||||
for (int i = 1; i < n_logits; ++i) {
|
||||
if (plogits[i] > max_logit) {
|
||||
max_logit = plogits[i];
|
||||
max_id = i;
|
||||
}
|
||||
}
|
||||
return max_id;
|
||||
}
|
||||
|
||||
std::vector<std::pair<float, llama_vocab::id>> logits_id;
|
||||
logits_id.reserve(n_logits);
|
||||
|
||||
|
@ -1668,6 +1682,33 @@ int llama_model_quantize(
|
|||
return 0;
|
||||
}
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.buf.data();
|
||||
}
|
||||
|
||||
// Returns the size of the KV cache
|
||||
size_t llama_get_kv_cache_size(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.buf.size();
|
||||
}
|
||||
|
||||
int llama_get_kv_cache_token_count(struct llama_context * ctx) {
|
||||
return ctx->model.kv_self.n;
|
||||
}
|
||||
|
||||
// Sets the KV cache containing the current context for the model
|
||||
void llama_set_kv_cache(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * kv_cache,
|
||||
size_t n_size,
|
||||
int n_token_count) {
|
||||
// Make sure we have the same kv cache setup
|
||||
LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
|
||||
memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
|
||||
ctx->model.kv_self.n = n_token_count;
|
||||
}
|
||||
|
||||
int llama_eval(
|
||||
struct llama_context * ctx,
|
||||
const llama_token * tokens,
|
||||
|
|
17
llama.h
17
llama.h
|
@ -83,6 +83,23 @@ extern "C" {
|
|||
const char * fname_out,
|
||||
int itype);
|
||||
|
||||
// Returns the KV cache that will contain the context for the
|
||||
// ongoing prediction with the model.
|
||||
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
|
||||
|
||||
// Returns the size of the KV cache
|
||||
LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
|
||||
|
||||
// Returns the number of tokens in the KV cache
|
||||
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
|
||||
|
||||
// Sets the KV cache containing the current context for the model
|
||||
LLAMA_API void llama_set_kv_cache(
|
||||
struct llama_context * ctx,
|
||||
const uint8_t * kv_cache,
|
||||
size_t n_size,
|
||||
int n_token_count);
|
||||
|
||||
// Run the llama inference to obtain the logits and probabilities for the next token.
|
||||
// tokens + n_tokens is the provided batch of new tokens to process
|
||||
// n_past is the number of tokens to use from previous eval calls
|
||||
|
|
BIN
main.exe
BIN
main.exe
Binary file not shown.
BIN
quantize.exe
BIN
quantize.exe
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue