Fix GPT2 not loading due to graph too small
parent eb42c73953
commit a6eb9b8010
8 changed files with 21 additions and 19 deletions
@@ -941,19 +941,20 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         llamamodel->hparams.rope_freq_scale_train!=1.0f ||
         llamamodel->hparams.rope_scaling_type_train==2)
         {
-            // float ropemultiplier = 1.0f;
-            // if(llamamodel->hparams.rope_scaling_type_train!=2 &&
-            // llamamodel->hparams.n_ctx_train > 2048 && clamped_max_context_length > llamamodel->hparams.n_ctx_train)
-            // {
-            // ropemultiplier = (float)llamamodel->hparams.n_ctx_train / (float)clamped_max_context_length;
-            // llama_ctx_params.rope_freq_base = rope_freq_base = llamamodel->hparams.rope_freq_base_train;
-            // llama_ctx_params.rope_freq_scale = rope_freq_scale = ropemultiplier * llamamodel->hparams.rope_freq_scale_train;
-            // printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
-            // }
-            // else
-            // {
+            float ropemultiplier = 1.0f;
+            if(llamamodel->hparams.rope_scaling_type_train!=2 &&
+               llamamodel->hparams.n_ctx_train > 2048 && clamped_max_context_length > llamamodel->hparams.n_ctx_train &&
+               llamamodel->hparams.rope_freq_scale_train==1.0f)
+            {
+                ropemultiplier = (float)llamamodel->hparams.n_ctx_train / (float)clamped_max_context_length;
+                llama_ctx_params.rope_freq_base = rope_freq_base = llamamodel->hparams.rope_freq_base_train;
+                llama_ctx_params.rope_freq_scale = rope_freq_scale = ropemultiplier * llamamodel->hparams.rope_freq_scale_train;
+                printf("Automatic RoPE Scaling: Using (scale:%.3f, base:%.1f).\n", rope_freq_scale, rope_freq_base);
+            }
+            else
+            {
                 printf("Automatic RoPE Scaling: Using model internal value.\n");
-            //}
+            }
         }
         else
         {
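The uncommented block restores linear RoPE scaling: when the user requests more context than the model was trained on, and the model ships no scaling of its own, the frequency scale is shrunk by the ratio of trained to requested context. A minimal standalone sketch of that rule, with hypothetical names (auto_rope_scale is not a function in this codebase):

// Sketch of the automatic linear RoPE scaling rule above; names are illustrative.
float auto_rope_scale(float n_ctx_train, float n_ctx_requested, float scale_train) {
    // Stretch only when more context is requested than the model was trained on
    // and the model does not already carry its own scale factor.
    if (n_ctx_requested > n_ctx_train && scale_train == 1.0f) {
        return n_ctx_train / n_ctx_requested; // e.g. trained 4096, requested 8192 -> 0.5
    }
    return scale_train; // otherwise keep the model's internal value
}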
@@ -455,7 +455,7 @@ bool gpt2_eval(


     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, 8192, false);

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
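This is the hunk the commit title refers to: ggml_new_graph allocates a graph with ggml's default capacity (GGML_DEFAULT_GRAPH_SIZE, 2048 nodes in ggml builds of this era), which the GPT-2 forward pass can exceed, so loading aborted. ggml_new_graph_custom takes an explicit node budget; the false disables gradient bookkeeping, which inference does not need. A minimal sketch of sizing the context to match, assuming the ggml graph-overhead helpers of the same vintage (the 8-tensor headroom is an illustrative assumption, not taken from this commit):

#include "ggml.h"

// Sketch: reserve context memory for a custom-sized graph, then allocate it.
struct ggml_init_params params = {
    /*.mem_size   =*/ ggml_graph_overhead_custom(8192, false) + 8*ggml_tensor_overhead(),
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ true,
};
struct ggml_context * ctx = ggml_init(params);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, 8192, /*grads=*/false); // room for 8192 nodes

Note this call site uses the literal 8192, which matches the GGML_MAX_NODES constant the other architectures use below.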
@@ -455,7 +455,7 @@ bool gptj_eval(


     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -12,6 +12,7 @@
 #include "llama_v3.h"

 #include "ggml.h"
+#include "otherarch.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #endif
@@ -88,7 +89,6 @@ enum e_model3 {

 static const size_t kB3 = 1024;
 static const size_t MB3 = 1024*1024;
-static const size_t GGML_MAX_NODES = 8192;

 // computed for n_ctx == 2048
 // TODO: dynamically determine these sizes
@@ -390,7 +390,7 @@ bool mpt_eval(const mpt_model & model, const int n_threads, const int n_past,
     params.no_alloc = false;

     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
@@ -471,7 +471,7 @@ bool gpt_neox_eval(


     struct ggml_context * ctx0 = ggml_init(params);
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);

     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -459,3 +459,4 @@ struct mpt_model {
 };

 const float default_norm_eps = 1e-5f;
+const size_t GGML_MAX_NODES = 8192;
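Taken together with the two hunks above, this moves the 8192-node cap out of the llama_v3 translation unit and into otherarch.h, so the gptj, mpt, gpt_neox, and rwkv eval paths all share one value. In C++ a namespace-scope const has internal linkage, so defining it in a header gives each including file its own copy without multiple-definition errors. A hypothetical guard one could add at a graph-building call site, assuming the public n_nodes field of ggml_cgraph (the fprintf is illustrative, not part of this commit):

// Sketch: after building the graph, check how close it gets to the cap.
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GGML_MAX_NODES, false);
// ... ggml ops appended here while the forward pass is described ...
fprintf(stderr, "graph uses %d of %zu nodes\n", gf->n_nodes, GGML_MAX_NODES);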
@@ -1520,7 +1520,7 @@ struct rwkv_context * rwkv_new_context_impl(std::shared_ptr<struct rwkv_instance
     serial_graph.ctx = graph_future_ctx;
     RWKV_ASSERT_NULL_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, serial_graph.ctx.ctx, "Failed to allocate serial graph context");
     serial_graph.tokens = ggml_new_i32(serial_graph.ctx.ctx, 0);
-    serial_graph.cgraph = ggml_new_graph(serial_graph.ctx.ctx);
+    serial_graph.cgraph = ggml_new_graph_custom(serial_graph.ctx.ctx, GGML_MAX_NODES, false);
     RWKV_ASSERT_NULL_MSG(RWKV_ERROR_ALLOC, serial_graph.cgraph, "Failed to allocate serial graph");

     RWKV_ASSERT_NULL(RWKV_ERROR_GRAPH, rwkv_build_serial_graph(
@@ -1698,7 +1698,7 @@ bool rwkv_eval_sequence(struct rwkv_context * ctx, const int n_threads, const ui
     sequence_graph.ctx = graph_future_ctx;
     RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_CTX | RWKV_ERROR_ALLOC, sequence_graph.ctx.ctx, "Failed to allocate sequence graph context");
     sequence_graph.tokens = ggml_new_tensor_1d(sequence_graph.ctx.ctx, GGML_TYPE_I32, sequence_len);
-    sequence_graph.cgraph = ggml_new_graph(sequence_graph.ctx.ctx);
+    sequence_graph.cgraph = ggml_new_graph_custom(sequence_graph.ctx.ctx, GGML_MAX_NODES, false);
     RWKV_ASSERT_FALSE_MSG(RWKV_ERROR_ALLOC, sequence_graph.cgraph, "Failed to allocate sequence graph");

     RWKV_ASSERT_FALSE(RWKV_ERROR_GRAPH, rwkv_build_sequence_graph(
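Both RWKV graph paths now get the same explicit node budget. For context, a custom-sized graph is used exactly like a default one: ops are appended while the forward pass is described, then the graph is expanded from its output tensor and computed. A hedged sketch, assuming ctx0 and gf as in the hunks above and inpL as the final output tensor of an eval function:

// Sketch: finish and run a custom-sized graph (names assumed from the eval paths).
ggml_build_forward_expand(gf, inpL);              // append inpL and its parents as graph nodes
ggml_graph_compute_with_ctx(ctx0, gf, n_threads); // run the graph using the context's work buffer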