just testing cublas

Concedo 2023-05-15 20:01:22 +08:00
parent fce2e7e518
commit 6504150fac
9 changed files with 53 additions and 59 deletions

View file

@@ -17,7 +17,7 @@ set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Release")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(LLAMA_STANDALONE ON)
set(BUILD_SHARED_LIBS_DEFAULT ON)
set(LLAMA_STATIC ON)
set(LLAMA_STATIC OFF)
set(LLAMA_NATIVE OFF)
set(LLAMA_LTO OFF)
set(LLAMA_ALL_WARNINGS OFF)
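Turning LLAMA_STATIC off is consistent with a cuBLAS test build: the CUDA runtime and cuBLAS ship as shared libraries, so forcing fully static linking tends to fail or add nothing for a GPU-enabled binary. For reference, upstream llama.cpp exposes the GPU path through an option such as LLAMA_CUBLAS (e.g. `cmake -DLLAMA_CUBLAS=ON ..`); whether this fork uses that exact flag is an assumption here.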

View file

@@ -10053,10 +10053,10 @@ enum ggml_v1_opt_result ggml_v1_opt(
struct ggml_v1_tensor * f) {
bool free_ctx = false;
if (ctx == NULL) {
struct ggml_v1_init_params params_ctx = {
.mem_size = 16*1024*1024,
.mem_buffer = NULL,
};
struct ggml_v1_init_params params_ctx;
params_ctx.mem_size = 16*1024*1024;
params_ctx.mem_buffer = NULL;
ctx = ggml_v1_init(params_ctx);
if (ctx == NULL) {
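The edit above, repeated across the model files below, trades a C99 designated initializer for member-by-member assignment. Both produce the same initialized struct, but the assignment form also compiles as pre-C++20 C++ (e.g. under MSVC/NVCC for a cuBLAS build), which is presumably the point of this test. A minimal sketch of the two equivalent forms, assuming only the two fields shown here:

    /* C99 designated-initializer form (the removed code); unnamed members are zero-filled. */
    struct ggml_v1_init_params a = {
        .mem_size   = 16*1024*1024,
        .mem_buffer = NULL,
    };

    /* Assignment form (the added code); valid in any C or C++ dialect, but plain
       assignment does not zero-fill the rest, so every field must be set explicitly. */
    struct ggml_v1_init_params b;
    b.mem_size   = 16*1024*1024;
    b.mem_buffer = NULL;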

View file

@@ -12,7 +12,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
@@ -137,10 +136,10 @@ ModelLoadResult legacy_gpt2_model_load(const std::string & fname, gpt2_v1_model
// create the ggml context
{
struct ggml_v1_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
};
struct ggml_v1_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
model.ctx = ggml_v1_init(params);
if (!model.ctx) {
@@ -352,10 +351,10 @@ bool legacy_gpt2_eval(
}
}
struct ggml_v1_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
};
struct ggml_v1_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_v1_context * ctx0 = ggml_v1_init(params);
struct ggml_v1_cgraph gf = { .n_threads = n_threads };

View file

@@ -13,7 +13,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
#include "model_adapter.h"
@@ -143,11 +142,11 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
model.ctx = ggml_init(params);
if (!model.ctx) {
@@ -370,11 +369,11 @@ bool gpt2_eval(
}
}
struct ggml_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
params.no_alloc = false;
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = { .n_threads = n_threads };
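The newer ggml_init_params carries a third field, no_alloc, which the rewritten blocks set explicitly. In ggml, no_alloc = true makes the context track tensor metadata without reserving data memory in its buffer, which is useful for measuring how large a buffer a graph needs. A small sketch, assuming the upstream ggml API:

    /* Sketch only: create a context that skips tensor data allocation. */
    struct ggml_init_params probe;
    probe.mem_size   = 16*1024*1024;  /* room for bookkeeping/metadata   */
    probe.mem_buffer = NULL;          /* let ggml_init allocate it       */
    probe.no_alloc   = true;          /* metadata only, no tensor data   */
    struct ggml_context * ctx_probe = ggml_init(probe);
    /* ... build tensors/graph, then query ggml_used_mem(ctx_probe) ... */
    ggml_free(ctx_probe);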

View file

@@ -12,7 +12,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
@@ -148,10 +147,10 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
// create the ggml context
{
struct ggml_v1_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
};
struct ggml_v1_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
model.ctx = ggml_v1_init(params);
if (!model.ctx) {
@@ -402,10 +401,10 @@ bool legacy_gptj_eval(
}
}
struct ggml_v1_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
};
struct ggml_v1_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_v1_context * ctx0 = ggml_v1_init(params);
struct ggml_v1_cgraph gf = { .n_threads = n_threads };

View file

@@ -13,7 +13,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
#include "model_adapter.h"
@@ -143,11 +142,11 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
model.ctx = ggml_init(params);
if (!model.ctx) {
@@ -382,11 +381,11 @@ bool gptj_eval(
}
}
struct ggml_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
params.no_alloc = false;
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = { .n_threads = n_threads };

View file

@@ -13,7 +13,6 @@
#include <string>
#include <vector>
#include <iostream>
#include <unistd.h>
@@ -135,11 +134,10 @@ ModelLoadResult stablelm_model_load(const std::string & fname, stablelm_model &
// create the ggml context
{
struct ggml_init_params params = {
.mem_size = ctx_size,
.mem_buffer = NULL,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
params.no_alloc = false;
model.ctx = ggml_init(params);
if (!model.ctx) {
@@ -377,11 +375,11 @@ bool stablelm_eval(
}
}
struct ggml_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
.no_alloc = false,
};
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
params.no_alloc = false;
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = { .n_threads = n_threads };

View file

@@ -10,7 +10,7 @@ import torch
import numpy as np
import re
from transformers import GPTJForCausalLM, AutoModelForCausalLM
from transformers import AutoModelForCausalLM
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():

View file

@@ -1,7 +1,6 @@
import sys
import struct
import json
import torch
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -59,6 +58,7 @@ fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", hparams["num_attention_heads"]))
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", int(hparams["rotary_pct"]*(hparams["hidden_size"]//hparams["num_attention_heads"]))))
fout.write(struct.pack("i", hparams["use_parallel_residual"]))
fout.write(struct.pack("i", ftype))
# TODO: temporary hack to not deal with implementing the tokenizer
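The rotary-dimension field written above is rotary_pct times the per-head width. For illustration only, a hypothetical GPT-NeoX-style checkpoint with hidden_size = 2560, num_attention_heads = 32 and rotary_pct = 0.25 would store int(0.25 * (2560 // 32)) = 20, i.e. rotary position embeddings applied to the first 20 of each head's 80 dimensions. use_parallel_residual, a boolean in the Hugging Face config, is likewise packed as a 4-byte int.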