grammars
This commit is contained in:
parent
e7552a4d78
commit
e3ae974d3d
3 changed files with 32 additions and 2 deletions
|
@ -51,7 +51,7 @@ namespace grammar_parser {
|
|||
}
|
||||
|
||||
// Returns true when `c` is a word character for this parser: an ASCII
// letter, an ASCII digit, '-' or '_'. Every other character (whitespace,
// punctuation, bytes outside the ASCII ranges) yields false.
//
// Only the underscore-accepting form of the return is kept: with the older
// return statement still ahead of it, the '_' case was never reached.
static bool is_word_char(char c) {
    return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || c == '_' || ('0' <= c && c <= '9');
}
|
||||
|
||||
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
|
||||
|
|
File diff suppressed because one or more lines are too long
30
llama.cpp
30
llama.cpp
|
@ -1409,6 +1409,7 @@ static bool llama_kv_cache_init(
|
|||
ggml_type wtype,
|
||||
uint32_t n_ctx,
|
||||
int n_gpu_layers) {
|
||||
fprintf(stderr, "GPULAYERS '%d'\n", n_gpu_layers);
|
||||
const uint32_t n_embd = hparams.n_embd_gqa();
|
||||
const uint32_t n_layer = hparams.n_layer;
|
||||
|
||||
|
@ -1446,6 +1447,7 @@ static bool llama_kv_cache_init(
|
|||
(void) n_gpu_layers;
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "USE CUBLAS\n");
|
||||
if (ggml_cublas_loaded()) {
|
||||
size_t vram_kv_cache = 0;
|
||||
|
||||
|
@ -1463,6 +1465,8 @@ static bool llama_kv_cache_init(
|
|||
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
|
||||
}
|
||||
}
|
||||
#else
|
||||
fprintf(stderr, "NO USE CUBLAS\n");
|
||||
#endif
|
||||
|
||||
return true;
|
||||
|
@ -1969,6 +1973,7 @@ struct llama_model_loader {
|
|||
break;
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
case GGML_BACKEND_GPU:
|
||||
|
||||
case GGML_BACKEND_GPU_SPLIT:
|
||||
// old code:
|
||||
//ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
|
||||
|
@ -2607,9 +2612,11 @@ static void llm_load_tensors(
|
|||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights00 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
}
|
||||
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||
fprintf(stderr, "vram_weights01 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
@ -2640,6 +2647,7 @@ static void llm_load_tensors(
|
|||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights03 '%ld'\n", vram_weights);
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
||||
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
||||
|
@ -2673,9 +2681,11 @@ static void llm_load_tensors(
|
|||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights04 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
}
|
||||
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||
fprintf(stderr, "vram_weights05 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
@ -2706,6 +2716,7 @@ static void llm_load_tensors(
|
|||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights06 '%ld'\n", vram_weights);
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
|
||||
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
|
||||
|
@ -2744,10 +2755,13 @@ static void llm_load_tensors(
|
|||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights07 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
fprintf(stderr, "vram_weights08 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm_b);
|
||||
}
|
||||
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||
fprintf(stderr, "vram_weights09 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
@ -2772,7 +2786,9 @@ static void llm_load_tensors(
|
|||
layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights10 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(layer.attn_norm_2);
|
||||
fprintf(stderr, "vram_weights11 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(layer.attn_norm_2_b);
|
||||
}
|
||||
}
|
||||
|
@ -2784,6 +2800,7 @@ static void llm_load_tensors(
|
|||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights12 '%ld'\n", vram_weights);
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
||||
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
|
||||
|
@ -2821,10 +2838,12 @@ static void llm_load_tensors(
|
|||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights13 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
vram_weights += ggml_nbytes(model.output_norm_b);
|
||||
}
|
||||
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||
fprintf(stderr, "vram_weights14 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
@ -2860,6 +2879,7 @@ static void llm_load_tensors(
|
|||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights15 '%ld'\n", vram_weights);
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
||||
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
||||
|
@ -2905,10 +2925,13 @@ static void llm_load_tensors(
|
|||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights16 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
fprintf(stderr, "vram_weights17 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm_b);
|
||||
}
|
||||
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||
fprintf(stderr, "vram_weights18 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
@ -2971,10 +2994,13 @@ static void llm_load_tensors(
|
|||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights19 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
fprintf(stderr, "vram_weights20 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm_b);
|
||||
}
|
||||
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||
fprintf(stderr, "vram_weights21 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
@ -3010,6 +3036,7 @@ static void llm_load_tensors(
|
|||
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights22 '%ld'\n", vram_weights);
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
||||
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
|
||||
|
@ -3048,9 +3075,11 @@ static void llm_load_tensors(
|
|||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||
|
||||
if (backend_norm == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights23 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output_norm);
|
||||
}
|
||||
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||
fprintf(stderr, "vram_weights24 '%ld'\n", vram_weights);
|
||||
vram_weights += ggml_nbytes(model.output);
|
||||
}
|
||||
}
|
||||
|
@ -3077,6 +3106,7 @@ static void llm_load_tensors(
|
|||
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||
|
||||
if (backend == GGML_BACKEND_GPU) {
|
||||
fprintf(stderr, "vram_weights25 '%ld'\n", vram_weights);
|
||||
vram_weights +=
|
||||
ggml_nbytes(layer.attn_norm) +
|
||||
ggml_nbytes(layer.wqkv) +
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue