mike dupont 2023-11-16 16:16:47 -05:00
parent e7552a4d78
commit e3ae974d3d
3 changed files with 32 additions and 2 deletions

common/grammar-parser.cpp

@@ -51,7 +51,7 @@ namespace grammar_parser {
}
static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || c == '_' || ('0' <= c && c <= '9');
}
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
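The hunk above widens the grammar parser's word-character set to accept '_' in addition to letters, digits, and '-'. A minimal standalone sketch of the updated predicate (just the function in isolation, not the full parser), showing the effect:

    #include <cstdio>

    // Updated predicate: '-' and now '_' count as word characters, so grammar
    // symbols such as "my_rule" are read as a single identifier.
    static bool is_word_char(char c) {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') ||
               c == '-' || c == '_' || ('0' <= c && c <= '9');
    }

    int main() {
        std::printf("'_': %d\n", is_word_char('_')); // 1 with this change, 0 before
        std::printf("' ': %d\n", is_word_char(' ')); // 0 in both versions
        return 0;
    }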

File diff suppressed because one or more lines are too long

llama.cpp

@@ -1409,6 +1409,7 @@ static bool llama_kv_cache_init(
ggml_type wtype,
uint32_t n_ctx,
int n_gpu_layers) {
fprintf(stderr, "GPULAYERS '%d'\n", n_gpu_layers);
const uint32_t n_embd = hparams.n_embd_gqa();
const uint32_t n_layer = hparams.n_layer;
@@ -1446,6 +1447,7 @@ static bool llama_kv_cache_init(
(void) n_gpu_layers;
#ifdef GGML_USE_CUBLAS
fprintf(stderr, "USE CUBLAS\n");
if (ggml_cublas_loaded()) {
size_t vram_kv_cache = 0;
@@ -1463,6 +1465,8 @@ static bool llama_kv_cache_init(
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
}
}
#else
fprintf(stderr, "NO USE CUBLAS\n");
#endif
return true;
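The llama_kv_cache_init hunks above only add diagnostics: print the requested GPU layer count and report whether the CUBLAS path was compiled in. A minimal sketch of that logging pattern as a standalone helper (the helper name is illustrative, not part of the commit):

    #include <cstdio>

    // Hypothetical helper mirroring the debug prints added above: report the
    // requested GPU layer count and whether GGML_USE_CUBLAS is defined.
    static void report_gpu_config(int n_gpu_layers) {
        std::fprintf(stderr, "GPULAYERS '%d'\n", n_gpu_layers);
    #ifdef GGML_USE_CUBLAS
        std::fprintf(stderr, "USE CUBLAS\n");
    #else
        std::fprintf(stderr, "NO USE CUBLAS\n");
    #endif
    }
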
@@ -1969,6 +1973,7 @@ struct llama_model_loader {
break;
#ifdef GGML_USE_CUBLAS
case GGML_BACKEND_GPU:
case GGML_BACKEND_GPU_SPLIT:
// old code:
//ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
@@ -2607,9 +2612,11 @@ static void llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights00 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
fprintf(stderr, "vram_weights01 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output);
}
}
@@ -2640,6 +2647,7 @@ static void llm_load_tensors(
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights03 '%ld'\n", vram_weights);
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
@@ -2673,9 +2681,11 @@ static void llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights04 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
fprintf(stderr, "vram_weights05 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output);
}
}
@@ -2706,6 +2716,7 @@ static void llm_load_tensors(
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights06 '%ld'\n", vram_weights);
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
@@ -2744,10 +2755,13 @@ static void llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights07 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm);
fprintf(stderr, "vram_weights08 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm_b);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
fprintf(stderr, "vram_weights09 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output);
}
}
@@ -2772,7 +2786,9 @@ static void llm_load_tensors(
layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
if (backend == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights10 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(layer.attn_norm_2);
fprintf(stderr, "vram_weights11 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(layer.attn_norm_2_b);
}
}
@@ -2784,6 +2800,7 @@ static void llm_load_tensors(
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights12 '%ld'\n", vram_weights);
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
@@ -2821,10 +2838,12 @@ static void llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights13 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm);
vram_weights += ggml_nbytes(model.output_norm_b);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
fprintf(stderr, "vram_weights14 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output);
}
}
@@ -2860,6 +2879,7 @@ static void llm_load_tensors(
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
if (backend == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights15 '%ld'\n", vram_weights);
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
@@ -2905,10 +2925,13 @@ static void llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights16 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm);
fprintf(stderr, "vram_weights17 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm_b);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
fprintf(stderr, "vram_weights18 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output);
}
}
@@ -2971,10 +2994,13 @@ static void llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights19 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm);
fprintf(stderr, "vram_weights20 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm_b);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
fprintf(stderr, "vram_weights21 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output);
}
}
@@ -3010,6 +3036,7 @@ static void llm_load_tensors(
layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
if (backend == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights22 '%ld'\n", vram_weights);
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
@@ -3048,9 +3075,11 @@ static void llm_load_tensors(
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights23 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output_norm);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
fprintf(stderr, "vram_weights24 '%ld'\n", vram_weights);
vram_weights += ggml_nbytes(model.output);
}
}
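The llm_load_tensors hunks all follow one pattern: immediately before each tensor's size is added to the running VRAM total, the current value of vram_weights is printed to stderr with a numbered tag. A hedged sketch of that accumulate-and-log step (the helper name and the %zu format are illustrative, not taken from the commit, which prints the size_t total with %ld):

    #include <cstddef>
    #include <cstdio>

    // Illustrative helper: log the running total, then add the next tensor's
    // byte count, mirroring the repeated fprintf / vram_weights += pairs above.
    static size_t log_and_add(size_t vram_weights, size_t nbytes, const char * tag) {
        std::fprintf(stderr, "%s '%zu'\n", tag, vram_weights);
        return vram_weights + nbytes;
    }

    // e.g.: vram_weights = log_and_add(vram_weights, ggml_nbytes(model.output), "vram_weights01");
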
@@ -3077,6 +3106,7 @@ static void llm_load_tensors(
layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
fprintf(stderr, "vram_weights25 '%ld'\n", vram_weights);
vram_weights +=
ggml_nbytes(layer.attn_norm) +
ggml_nbytes(layer.wqkv) +