just testing cublas

Concedo 2023-05-15 20:01:22 +08:00
parent fce2e7e518
commit 6504150fac
9 changed files with 53 additions and 59 deletions

View file

@@ -17,7 +17,7 @@ set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 set(LLAMA_STANDALONE ON)
 set(BUILD_SHARED_LIBS_DEFAULT ON)
-set(LLAMA_STATIC ON)
+set(LLAMA_STATIC OFF)
 set(LLAMA_NATIVE OFF)
 set(LLAMA_LTO OFF)
 set(LLAMA_ALL_WARNINGS OFF)
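Note on the hunk above: the only functional change is LLAMA_STATIC flipping from ON to OFF. A fully static link generally cannot be combined with the CUDA/cuBLAS runtime libraries, so disabling it is the expected prerequisite for the cuBLAS experiment named in the commit message; the surrounding options are unchanged context.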

View file

@@ -10053,10 +10053,10 @@ enum ggml_v1_opt_result ggml_v1_opt(
         struct ggml_v1_tensor * f) {
     bool free_ctx = false;
     if (ctx == NULL) {
-        struct ggml_v1_init_params params_ctx = {
-            .mem_size   = 16*1024*1024,
-            .mem_buffer = NULL,
-        };
+        struct ggml_v1_init_params params_ctx;
+        params_ctx.mem_size = 16*1024*1024;
+        params_ctx.mem_buffer = NULL;
         ctx = ggml_v1_init(params_ctx);
         if (ctx == NULL) {
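Every C/C++ hunk in this commit applies the same mechanical rewrite: a C99 designated-initializer construction of a ggml init-params struct becomes a plain declaration followed by member assignments. Designated initializers were only added to C++ in C++20, so C++ compilers (including the host compiler that nvcc drives for a cuBLAS build, and MSVC) commonly reject the old form, while the assignment form compiles as both C and C++. A minimal self-contained sketch of the before/after; the struct definition below is an illustrative stand-in, only the field names come from the diff:

    #include <stddef.h>
    #include <stdio.h>

    /* Illustrative stand-in for ggml's init-params struct. */
    struct ggml_v1_init_params {
        size_t mem_size;
        void * mem_buffer;
    };

    int main(void) {
        /* Before (C99 designated initializer, rejected by pre-C++20 C++):
           struct ggml_v1_init_params params_ctx = {
               .mem_size   = 16*1024*1024,
               .mem_buffer = NULL,
           };
        */

        /* After: declaration plus member assignments, valid in both C and C++. */
        struct ggml_v1_init_params params_ctx;
        params_ctx.mem_size = 16*1024*1024;
        params_ctx.mem_buffer = NULL;

        printf("mem_size = %zu\n", params_ctx.mem_size);
        return 0;
    }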

View file

@@ -12,7 +12,6 @@
 #include <string>
 #include <vector>
 #include <iostream>
-#include <unistd.h>
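<unistd.h> is a POSIX header with no MSVC counterpart, so dropping the include is the usual first step toward a Windows/cuBLAS build; presumably nothing left in these files needs it. If some of its functionality were still required, the conventional guard looks like this sketch (the <io.h> fallback is an assumption, not something this commit adds):

    /* Hypothetical portability guard; the commit itself simply deletes the include. */
    #ifdef _WIN32
    #include <io.h>      /* Windows home of some unistd.h-style APIs */
    #else
    #include <unistd.h>  /* POSIX systems */
    #endif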
@@ -137,10 +136,10 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
     // create the ggml context
     {
-        struct ggml_v1_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-        };
+        struct ggml_v1_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
         model.ctx = ggml_v1_init(params);
         if (!model.ctx) {
@@ -352,10 +351,10 @@ bool legacy_gpt2_eval(
         }
     }
-    struct ggml_v1_init_params params = {
-        .mem_size   = buf_size,
-        .mem_buffer = buf,
-    };
+    struct ggml_v1_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
     struct ggml_v1_context * ctx0 = ggml_v1_init(params);
     struct ggml_v1_cgraph gf = { .n_threads = n_threads };
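Note that the unchanged context line above still initializes ggml_v1_cgraph with a designated initializer ({ .n_threads = n_threads }); only the init-params structs are converted in this commit, so that line would still need attention under a strict pre-C++20 C++ compile.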

View file

@@ -13,7 +13,6 @@
 #include <string>
 #include <vector>
 #include <iostream>
-#include <unistd.h>
 #include "model_adapter.h"
@@ -143,11 +142,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
     // create the ggml context
     {
-        struct ggml_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
-        };
+        struct ggml_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+        params.no_alloc = false;
         model.ctx = ggml_init(params);
         if (!model.ctx) {
@@ -370,11 +369,11 @@ bool gpt2_eval(
         }
     }
-    struct ggml_init_params params = {
-        .mem_size   = buf_size,
-        .mem_buffer = buf,
-        .no_alloc   = false,
-    };
+    struct ggml_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
+    params.no_alloc = false;
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph gf = { .n_threads = n_threads };
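The non-legacy loaders use ggml_init_params, which adds a third field, no_alloc, to the mem_size/mem_buffer pair seen in the ggml_v1 hunks above; the rewrite pattern is otherwise identical, and it recurs unchanged in the gptj and stablelm files below.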

View file

@@ -12,7 +12,6 @@
 #include <string>
 #include <vector>
 #include <iostream>
-#include <unistd.h>
@@ -148,10 +147,10 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
     // create the ggml context
     {
-        struct ggml_v1_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-        };
+        struct ggml_v1_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
         model.ctx = ggml_v1_init(params);
         if (!model.ctx) {
@@ -402,10 +401,10 @@ bool legacy_gptj_eval(
         }
     }
-    struct ggml_v1_init_params params = {
-        .mem_size   = buf_size,
-        .mem_buffer = buf,
-    };
+    struct ggml_v1_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
     struct ggml_v1_context * ctx0 = ggml_v1_init(params);
     struct ggml_v1_cgraph gf = { .n_threads = n_threads };

View file

@@ -13,7 +13,6 @@
 #include <string>
 #include <vector>
 #include <iostream>
-#include <unistd.h>
 #include "model_adapter.h"
@@ -143,11 +142,11 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
     // create the ggml context
     {
-        struct ggml_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
-        };
+        struct ggml_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+        params.no_alloc = false;
         model.ctx = ggml_init(params);
         if (!model.ctx) {
@@ -382,11 +381,11 @@ bool gptj_eval(
         }
     }
-    struct ggml_init_params params = {
-        .mem_size   = buf_size,
-        .mem_buffer = buf,
-        .no_alloc   = false,
-    };
+    struct ggml_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
+    params.no_alloc = false;
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph gf = { .n_threads = n_threads };

View file

@@ -13,7 +13,6 @@
 #include <string>
 #include <vector>
 #include <iostream>
-#include <unistd.h>
@@ -135,11 +134,10 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
     // create the ggml context
     {
-        struct ggml_init_params params = {
-            .mem_size   = ctx_size,
-            .mem_buffer = NULL,
-            .no_alloc   = false,
-        };
+        struct ggml_init_params params;
+        params.mem_size = ctx_size;
+        params.mem_buffer = NULL;
+        params.no_alloc = false;
         model.ctx = ggml_init(params);
         if (!model.ctx) {
@@ -377,11 +375,11 @@ bool stablelm_eval(
         }
     }
-    struct ggml_init_params params = {
-        .mem_size   = buf_size,
-        .mem_buffer = buf,
-        .no_alloc   = false,
-    };
+    struct ggml_init_params params;
+    params.mem_size = buf_size;
+    params.mem_buffer = buf;
+    params.no_alloc = false;
     struct ggml_context * ctx0 = ggml_init(params);
     struct ggml_cgraph gf = { .n_threads = n_threads };
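The stablelm loader receives the same two conversions (dropped <unistd.h>, member-assignment initialization). It is presumably also the consumer of the extra header field added by the converter script in the last hunk of this commit, since rotary_pct and use_parallel_residual are GPT-NeoX-family hyperparameters.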

View file

@@ -10,7 +10,7 @@ import torch
 import numpy as np
 import re
-from transformers import GPTJForCausalLM, AutoModelForCausalLM
+from transformers import AutoModelForCausalLM
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
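Dropping GPTJForCausalLM leaves AutoModelForCausalLM as the script's only model loader; the Auto class dispatches on the checkpoint's config.model_type (including gptj), so the explicit class import is redundant provided nothing else in the script referenced it.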

View file

@@ -1,7 +1,6 @@
 import sys
 import struct
 import json
-import torch
 import numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -59,6 +58,7 @@ fout.write(struct.pack("i", hparams["hidden_size"]))
 fout.write(struct.pack("i", hparams["num_attention_heads"]))
 fout.write(struct.pack("i", hparams["num_hidden_layers"]))
 fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
+fout.write(struct.pack("i", hparams["use_parallel_residual"]))
 fout.write(struct.pack("i", ftype))
 # TODO: temporary hack to not deal with implementing the tokenizer
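The added line writes one more 32-bit integer, use_parallel_residual, into the model-file header between the rotary-dimension field and ftype. struct.pack("i", ...) emits a native-order 4-byte int (Python bools pack as 0/1), and a loader must consume header fields in exactly the order they were written or every later field shifts by four bytes. A hedged C sketch of the consuming side; function and variable names are illustrative, not koboldcpp's actual loader code:

    #include <stdint.h>
    #include <stdio.h>

    /* Read one 32-bit header field, matching struct.pack("i", ...). */
    static int read_i32(FILE *f, int32_t *out) {
        return fread(out, sizeof *out, 1, f) == 1;
    }

    int main(void) {
        FILE *f = fopen("model.bin", "rb");  /* illustrative file name */
        if (!f) return 1;

        /* ...earlier header fields would be read here, in write order... */
        int32_t rot_dim, use_parallel_residual, ftype;
        if (!read_i32(f, &rot_dim) ||
            !read_i32(f, &use_parallel_residual) ||  /* the new field */
            !read_i32(f, &ftype)) {
            fclose(f);
            return 1;
        }
        fclose(f);
        return 0;
    }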