Merge branch 'ggerganov:master' into load-parallel-prompt-file

Commit e293ebd68e
9 changed files with 331 additions and 196 deletions

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
+cmake_minimum_required(VERSION 3.13) # for add_link_options
 project("llama.cpp" C CXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -343,8 +343,9 @@ if (LLAMA_MPI)
 set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
 add_compile_definitions(GGML_USE_MPI)
 add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
-set(cxx_flags ${cxx_flags} -Wno-cast-qual)
-set(c_flags ${c_flags} -Wno-cast-qual)
+if (NOT MSVC)
+    add_compile_options(-Wno-cast-qual)
+endif()
 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
 set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
 # Even if you're only using the C header, C++ programs may bring in MPI
@@ -418,10 +419,11 @@ if (LLAMA_ALL_WARNINGS)
 set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int
 -Werror=implicit-function-declaration)
 set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+set(host_cxx_flags "")
 
 if (CMAKE_C_COMPILER_ID MATCHES "Clang")
 set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-set(cxx_flags ${cxx_flags} -Wmissing-prototypes -Wextra-semi)
+set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
 
 if (
 (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
@@ -431,27 +433,38 @@ if (LLAMA_ALL_WARNINGS)
 endif()
 elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
 set(c_flags ${c_flags} -Wdouble-promotion)
-set(cxx_flags ${cxx_flags} -Wno-array-bounds)
+set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
 
 if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-set(cxx_flags ${cxx_flags} -Wno-format-truncation)
+set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
 endif()
 if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-set(cxx_flags ${cxx_flags} -Wextra-semi)
+set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
 endif()
 endif()
 else()
 # todo : msvc
 endif()
 
-add_compile_options(
-    ${warning_flags}
-    "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
-    "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
-)
+set(c_flags ${c_flags} ${warning_flags})
+set(cxx_flags ${cxx_flags} ${warning_flags})
+add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+                    "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags} ${host_cxx_flags}>")
 
 endif()
 
+if (NOT MSVC)
+    set(cuda_flags -Wno-pedantic)
+endif()
+set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
+
+list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
+if (NOT cuda_host_flags STREQUAL "")
+    set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
+endif()
+
+add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
+
 if (WIN32)
 add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
 
@ -41,8 +41,7 @@ if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
|
||||||
|
|
||||||
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
|
||||||
|
|
||||||
ARCH=gguf.MODEL_ARCH.LLAMA
|
ARCH = gguf.MODEL_ARCH.LLAMA
|
||||||
NAMES=gguf.MODEL_TENSOR_NAMES[ARCH]
|
|
||||||
|
|
||||||
DEFAULT_CONCURRENCY = 8
|
DEFAULT_CONCURRENCY = 8
|
||||||
#
|
#
|
||||||
|
@ -953,7 +952,7 @@ class OutputFile:
|
||||||
of.close()
|
of.close()
|
||||||
|
|
||||||
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
|
||||||
wq_type = model[NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
|
||||||
|
|
||||||
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
|
||||||
return GGMLFileType.AllF32
|
return GGMLFileType.AllF32
|
||||||
|
|
|
@ -313,7 +313,7 @@ class ModelParams:
|
||||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||||
|
|
||||||
def tensor_name(key, bid=None, suffix=".weight"):
|
def tensor_name(key, bid=None, suffix=".weight"):
|
||||||
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix
|
return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix
|
||||||
|
|
||||||
class Layer:
|
class Layer:
|
||||||
def __init__(self, params, lora_params, bid):
|
def __init__(self, params, lora_params, bid):
|
||||||
|
|
|
@ -332,8 +332,8 @@ static void init_model(struct llama_model * input, struct my_llama_model * model
|
||||||
|
|
||||||
assert_shape_1d(layer.attention_norm, hparams.n_embd);
|
assert_shape_1d(layer.attention_norm, hparams.n_embd);
|
||||||
assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
|
assert_shape_2d(layer.wq, hparams.n_embd, hparams.n_embd);
|
||||||
assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd);
|
assert_shape_2d(layer.wk, hparams.n_embd, hparams.n_embd_gqa());
|
||||||
assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd);
|
assert_shape_2d(layer.wv, hparams.n_embd, hparams.n_embd_gqa());
|
||||||
assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
|
assert_shape_2d(layer.wo, hparams.n_embd, hparams.n_embd);
|
||||||
assert_shape_1d(layer.ffn_norm, hparams.n_embd);
|
assert_shape_1d(layer.ffn_norm, hparams.n_embd);
|
||||||
assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
|
assert_shape_2d(layer.w1, hparams.n_embd, hparams.n_ff);
|
||||||
|
|
|
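
Note: the wk/wv asserts above now expect hparams.n_embd_gqa() columns instead of hparams.n_embd, so checkpoints using grouped-query attention (fewer key/value heads than query heads) pass the shape checks. A rough sketch of the shapes involved, in Python for illustration only (the example numbers and the n_embd_gqa formula are assumptions about the helper, not code from this patch):

    n_embd    = 4096              # model width (example value)
    n_head    = 32                # query heads (example value)
    n_head_kv = 8                 # key/value heads; equals n_head when GQA is not used
    head_dim  = n_embd // n_head
    n_embd_gqa = head_dim * n_head_kv   # assumed width of the K and V projections

    wq_shape = (n_embd, n_embd)         # unchanged
    wk_shape = (n_embd, n_embd_gqa)     # smaller when n_head_kv < n_head
    wv_shape = (n_embd, n_embd_gqa)
    print(wq_shape, wk_shape, wv_shape)
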
@ -364,7 +364,7 @@ class ModelParams:
|
||||||
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||||
|
|
||||||
def tensor_name(key, bid=None):
|
def tensor_name(key, bid=None):
|
||||||
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + ".weight"
|
return gguf.TENSOR_NAMES[key].format(bid=bid) + ".weight"
|
||||||
|
|
||||||
class Layer:
|
class Layer:
|
||||||
def __init__(self, params, bid):
|
def __init__(self, params, bid):
|
||||||
|
|
|
@ -1476,10 +1476,15 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
const int64_t ne10 = src1->ne[0];
|
||||||
const int64_t ne11 = src1->ne[1];
|
const int64_t ne11 = src1->ne[1];
|
||||||
|
const int64_t ne12 = src1->ne[2];
|
||||||
|
const int64_t ne13 = src1->ne[3];
|
||||||
|
|
||||||
const int nb2 = dst->nb[2];
|
const int nb2 = dst->nb[2];
|
||||||
const int nb3 = dst->nb[3];
|
const int nb3 = dst->nb[3];
|
||||||
|
|
||||||
|
const int64_t r2 = ne12 / ne02;
|
||||||
|
const int64_t r3 = ne13 / ne03;
|
||||||
|
|
||||||
const float alpha = 1.0f;
|
const float alpha = 1.0f;
|
||||||
const float beta = 0.0f;
|
const float beta = 0.0f;
|
||||||
const int x_ne = ne01 * ne00;
|
const int x_ne = ne01 * ne00;
|
||||||
|
@ -1498,13 +1503,22 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
|
||||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
int64_t pi02 = -1;
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
int64_t pi03 = -1;
|
||||||
|
|
||||||
|
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||||
|
int64_t i03 = i13 / r3;
|
||||||
|
|
||||||
|
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||||
|
int64_t i02 = i12 / r2;
|
||||||
|
|
||||||
// copy data to device
|
// copy data to device
|
||||||
if (src0->backend != GGML_BACKEND_GPU) {
|
if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
|
||||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
|
||||||
|
pi02 = i02;
|
||||||
|
pi03 = i03;
|
||||||
}
|
}
|
||||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
|
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
|
||||||
|
|
||||||
CL_CHECK(clFinish(queue));
|
CL_CHECK(clFinish(queue));
|
||||||
|
|
||||||
|
@ -1525,7 +1539,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy dst to host
|
// copy dst to host
|
||||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -1547,6 +1561,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb10 = src1->nb[0];
     const int nb11 = src1->nb[1];
@@ -1556,6 +1572,9 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
     const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
     const int x_ne = ne01 * ne00;
@@ -1577,32 +1596,41 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     bool src1_cont_rows = nb10 == sizeof(float);
     bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float);
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device
-            if (src0->backend != GGML_BACKEND_GPU) {
+            if (src0->backend != GGML_BACKEND_GPU && (i02 != pi02 || i03 != pi03)) {
                 CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
+                pi02 = i02;
+                pi03 = i03;
             }
 
             // convert src1 to fp16
             // TODO: use multiple threads
-            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
-            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i13 * ne12 + i12);
+            char * src1i = (char *) src1->data + i13*nb13 + i12*nb12;
             if (src1_cont_rows) {
                 if (src1_cont_cols) {
                     ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
                 }
                 else {
-                    for (int64_t i01 = 0; i01 < ne11; i01++) {
-                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    for (int64_t i11 = 0; i11 < ne11; i11++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10);
                     }
                 }
             }
             else {
-                for (int64_t i01 = 0; i01 < ne11; i01++) {
-                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                for (int64_t i11 = 0; i11 < ne11; i11++) {
+                    for (int64_t i10 = 0; i10 < ne10; i10++) {
                         // very slow due to no inlining
-                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                        tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10));
                     }
                 }
             }
@@ -1631,7 +1659,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
             // copy dst to host, then convert to float
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
 
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
 
             ggml_fp16_to_fp32_row(tmp, d, d_ne);
         }
@@ -1652,12 +1680,17 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
 
     const int nb2 = dst->nb[2];
     const int nb3 = dst->nb[3];
     const ggml_type type = src0->type;
     const bool mul_mat_vec = ne11 == 1;
 
+    const int64_t r2 = ne12 / ne02;
+    const int64_t r3 = ne13 / ne03;
+
     const float alpha = 1.0f;
     const float beta = 0.0f;
     const int x_ne = ne01 * ne00;
@@ -1690,12 +1723,23 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
     size_t ev_idx = 0;
     std::vector<cl_event> events;
 
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
+    int64_t pi02 = -1;
+    int64_t pi03 = -1;
+
+    for (int64_t i13 = 0; i13 < ne13; i13++) {
+        int64_t i03 = i13 / r3;
+
+        for (int64_t i12 = 0; i12 < ne12; i12++) {
+            int64_t i02 = i12 / r2;
+
             // copy src0 to device if necessary
             if (src0->backend == GGML_BACKEND_CPU) {
-                events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                if (i02 != pi02 || i03 != pi03) {
+                    events.emplace_back();
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
+                    pi02 = i02;
+                    pi03 = i03;
+                }
             } else if (src0->backend == GGML_BACKEND_GPU) {
                 d_Q = (cl_mem) src0->extra;
             } else {
@@ -1704,7 +1748,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
                 // copy src1 to device
                 events.emplace_back();
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, events.data() + ev_idx++));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++));
 
                 // compute
                 const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
@@ -1725,7 +1769,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                 CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL));
 
                 // copy src1 to device
-                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL));
+                CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
 
                 events.emplace_back();
 
@@ -1749,7 +1793,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             }
 
             // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
             CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL));
             for (auto *event : events) {
                 clReleaseEvent(event);
ggml.c

@@ -11621,11 +11621,6 @@ static void ggml_compute_forward_mul_mat(
 
 #if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
-        // TODO: handle case when src0 is broadcast-able into src1 across 2nd,3rd dimension
-        // ref: https://github.com/ggerganov/ggml/pull/224
-        GGML_ASSERT(ne02 == ne12);
-        GGML_ASSERT(ne03 == ne13);
-
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
             ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
         }
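
Note on the OpenCL changes above: the mul_mat loops now iterate over src1's higher dimensions (ne12, ne13) and map each position back to a src0 slice through the ratios r2 = ne12/ne02 and r3 = ne13/ne03, so a src0 with fewer slices can be broadcast across src1; the pi02/pi03 bookkeeping skips re-uploading a src0 slice that is already on the device. That is also why the GGML_ASSERT(ne02 == ne12) / GGML_ASSERT(ne03 == ne13) guards could be dropped from ggml.c. A minimal Python sketch of the same index mapping (illustrative only, not part of the patch):

    # Each (i12, i13) position of src1/dst reads src0 slice (i02, i03) = (i12 // r2, i13 // r3).
    def broadcast_pairs(ne02, ne03, ne12, ne13):
        r2, r3 = ne12 // ne02, ne13 // ne03
        pi02 = pi03 = -1
        for i13 in range(ne13):
            i03 = i13 // r3
            for i12 in range(ne12):
                i02 = i12 // r2
                need_upload = (i02 != pi02) or (i03 != pi03)  # skip redundant host->device copies
                pi02, pi03 = i02, i03
                yield (i12, i13), (i02, i03), need_upload

    # Example: src0 has 2 slices, src1 has 4 -> each src0 slice is reused for two src1 slices
    # but only uploaded once.
    for dst_idx, src0_idx, upload in broadcast_pairs(2, 1, 4, 1):
        print(dst_idx, src0_idx, upload)
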
@@ -85,10 +85,12 @@ class MODEL_ARCH(IntEnum):
     GPTNEOX : int = auto()
     MPT : int = auto()
     STARCODER : int = auto()
+    BERT : int = auto()
 
 
 class MODEL_TENSOR(IntEnum):
     TOKEN_EMBD : int = auto()
+    TOKEN_TYPES : int = auto()
     POS_EMBD : int = auto()
     OUTPUT : int = auto()
     OUTPUT_NORM : int = auto()
@@ -116,78 +118,138 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GPTNEOX: "gptneox",
     MODEL_ARCH.MPT: "mpt",
     MODEL_ARCH.STARCODER: "starcoder",
+    MODEL_ARCH.BERT: "bert",
 }
 
-MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
-    MODEL_ARCH.LLAMA: {
-        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
-        MODEL_TENSOR.OUTPUT: "output",
-        MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
-        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
-        MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
-        MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
-        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
-        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
-        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
-        MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
-        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
-        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.GPTNEOX: {
-        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
-        MODEL_TENSOR.OUTPUT: "output",
-        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
-        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
-        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
-        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
-        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.FALCON: {
-        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
-        MODEL_TENSOR.OUTPUT: "output",
-        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
-        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
-        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
-        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
-        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.BAICHUAN: {
-        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
-        MODEL_TENSOR.OUTPUT: "output",
-        MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
-        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
-        MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
-        MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
-        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
-        MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
-        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
-        MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
-        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
-        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.STARCODER: {
-        MODEL_TENSOR.TOKEN_EMBD: "token_embd",
-        MODEL_TENSOR.POS_EMBD: "position_embd",
-        MODEL_TENSOR.OUTPUT_NORM: "output_norm",
-        MODEL_TENSOR.OUTPUT: "output",
-        MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
-        MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
-        MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
-        MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
-        MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
-        MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
-    },
-    MODEL_ARCH.GPT2: {
+TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
+    MODEL_TENSOR.TOKEN_EMBD: "token_embd",
+    MODEL_TENSOR.TOKEN_TYPES: "token_types",
+    MODEL_TENSOR.POS_EMBD: "position_embd",
+    MODEL_TENSOR.OUTPUT_NORM: "output_norm",
+    MODEL_TENSOR.OUTPUT: "output",
+    MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
+
+    MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
+    MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
+    MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
+    MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
+    MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
+    MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
+    MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
+    MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
+    MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
+    MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
+    MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
+    MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
+}
+
+MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
+    MODEL_ARCH.LLAMA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GPTNEOX: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.FALCON: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_NORM_2,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.BAICHUAN: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.STARCODER: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.BERT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.TOKEN_TYPES,
+        MODEL_TENSOR.POS_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.MPT: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_QKV,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GPTJ: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
+    MODEL_ARCH.GPT2: [
         # TODO
-    },
+    ],
     # TODO
 }
 
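
Note: the per-architecture MODEL_TENSOR_NAMES tables are replaced by a single TENSOR_NAMES dict (one GGUF name template per tensor kind) plus MODEL_TENSORS, which records which tensor kinds each architecture actually uses. This is why the converter scripts earlier in this diff now call gguf.TENSOR_NAMES[key].format(bid=bid) directly. A small sketch of the lookup, assuming these definitions are importable as gguf (illustrative only, not part of the patch):

    import gguf

    def tensor_name(key, bid=None, suffix=".weight"):
        # Same pattern the converter scripts use after this change.
        return gguf.TENSOR_NAMES[key].format(bid=bid) + suffix

    print(tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid=0))   # blk.0.attn_q.weight
    print(tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM))     # output_norm.weight
    # Architecture membership is now a separate check:
    assert gguf.MODEL_TENSOR.ATTN_QKV not in gguf.MODEL_TENSORS[gguf.MODEL_ARCH.LLAMA]
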
@@ -208,31 +270,40 @@ class TensorNameMap:
     mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
         # Token embeddings
         MODEL_TENSOR.TOKEN_EMBD: (
             "gpt_neox.embed_in", # gptneox
-            "transformer.wte", # gpt2 mpt
+            "transformer.wte", # gpt2 gpt-j mpt
             "transformer.word_embeddings", # falcon
             "model.embed_tokens", # llama-hf
             "tok_embeddings", # llama-pth
+            "embeddings.word_embeddings", # bert
+        ),
+
+        # Token type embeddings
+        MODEL_TENSOR.TOKEN_TYPES: (
+            "embeddings.token_type_embeddings", # bert
         ),
 
         # Position embeddings
         MODEL_TENSOR.POS_EMBD: (
             "transformer.wpe", # gpt2
+            "embeddings.position_embeddings", # bert
         ),
 
         # Output
         MODEL_TENSOR.OUTPUT: (
             "embed_out", # gptneox
-            "lm_head", # gpt2 mpt falcon llama-hf baichuan
+            "lm_head", # gpt2 gpt-j mpt falcon llama-hf baichuan
             "output", # llama-pth
         ),
 
         # Output norm
         MODEL_TENSOR.OUTPUT_NORM: (
             "gpt_neox.final_layer_norm", # gptneox
-            "transformer.ln_f", # gpt2 falcon
+            "transformer.ln_f", # gpt2 gpt-j falcon
             "model.norm", # llama-hf baichuan
             "norm", # llama-pth
+            "embeddings.LayerNorm", # bert
+            "transformer.norm_f", # mpt
         ),
 
         # Rope frequencies
@@ -244,13 +315,14 @@ class TensorNameMap:
     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
         # Attention norm
         MODEL_TENSOR.ATTN_NORM: (
             "gpt_neox.layers.{bid}.input_layernorm", # gptneox
-            "transformer.h.{bid}.ln_1", # gpt2
+            "transformer.h.{bid}.ln_1", # gpt2 gpt-j
             "transformer.blocks.{bid}.norm_1", # mpt
             "transformer.h.{bid}.input_layernorm", # falcon7b
             "transformer.h.{bid}.ln_mlp", # falcon40b
             "model.layers.{bid}.input_layernorm", # llama-hf
             "layers.{bid}.attention_norm", # llama-pth
+            "encoder.layer.{bid}.attention.output.LayerNorm", # bert
         ),
 
         # Attention norm 2
@@ -260,38 +332,46 @@ class TensorNameMap:
 
         # Attention query-key-value
         MODEL_TENSOR.ATTN_QKV: (
             "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox
             "transformer.h.{bid}.attn.c_attn", # gpt2
             "transformer.blocks.{bid}.attn.Wqkv", # mpt
             "transformer.h.{bid}.self_attention.query_key_value", # falcon
         ),
 
         # Attention query
         MODEL_TENSOR.ATTN_Q: (
             "model.layers.{bid}.self_attn.q_proj", # llama-hf
             "layers.{bid}.attention.wq", # llama-pth
+            "encoder.layer.{bid}.attention.self.query", # bert
+            "transformer.h.{bid}.attn.q_proj", # gpt-j
         ),
 
         # Attention key
         MODEL_TENSOR.ATTN_K: (
             "model.layers.{bid}.self_attn.k_proj", # llama-hf
             "layers.{bid}.attention.wk", # llama-pth
+            "encoder.layer.{bid}.attention.self.key", # bert
+            "transformer.h.{bid}.attn.k_proj", # gpt-j
         ),
 
         # Attention value
         MODEL_TENSOR.ATTN_V: (
             "model.layers.{bid}.self_attn.v_proj", # llama-hf
             "layers.{bid}.attention.wv", # llama-pth
+            "encoder.layer.{bid}.attention.self.value", # bert
+            "transformer.h.{bid}.attn.v_proj", # gpt-j
         ),
 
         # Attention output
         MODEL_TENSOR.ATTN_OUT: (
             "gpt_neox.layers.{bid}.attention.dense", # gptneox
             "transformer.h.{bid}.attn.c_proj", # gpt2
             "transformer.blocks.{bid}.attn.out_proj", # mpt
             "transformer.h.{bid}.self_attention.dense", # falcon
             "model.layers.{bid}.self_attn.o_proj", # llama-hf
             "layers.{bid}.attention.wo", # llama-pth
+            "encoder.layer.{bid}.attention.output.dense", # bert
+            "transformer.h.{bid}.attn.out_proj", # gpt-j
         ),
 
         # Rotary embeddings
@@ -302,21 +382,24 @@ class TensorNameMap:
 
         # Feed-forward norm
         MODEL_TENSOR.FFN_NORM: (
             "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox
             "transformer.h.{bid}.ln_2", # gpt2
             "transformer.blocks.{bid}.norm_2", # mpt
             "model.layers.{bid}.post_attention_layernorm", # llama-hf
             "layers.{bid}.ffn_norm", # llama-pth
+            "encoder.layer.{bid}.output.LayerNorm", # bert
         ),
 
         # Feed-forward up
         MODEL_TENSOR.FFN_UP: (
             "gpt_neox.layers.{bid}.mlp.dense_h_to_4h", # gptneox
             "transformer.h.{bid}.mlp.c_fc", # gpt2
             "transformer.blocks.{bid}.ffn.up_proj", # mpt
             "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon
             "model.layers.{bid}.mlp.up_proj", # llama-hf
             "layers.{bid}.feed_forward.w3", # llama-pth
+            "encoder.layer.{bid}.intermediate.dense", # bert
+            "transformer.h.{bid}.mlp.fc_in", # gpt-j
         ),
 
         # Feed-forward gate
@@ -327,39 +410,37 @@ class TensorNameMap:
 
         # Feed-forward down
         MODEL_TENSOR.FFN_DOWN: (
             "gpt_neox.layers.{bid}.mlp.dense_4h_to_h", # gptneox
             "transformer.h.{bid}.mlp.c_proj", # gpt2
             "transformer.blocks.{bid}.ffn.down_proj", # mpt
             "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon
             "model.layers.{bid}.mlp.down_proj", # llama-hf
             "layers.{bid}.feed_forward.w2", # llama-pth
+            "encoder.layer.{bid}.output.dense", # bert
+            "transformer.h.{bid}.mlp.fc_out", # gpt-j
         ),
     }
 
     mapping: dict[str, tuple[MODEL_TENSOR, str]]
 
-    tensor_names: dict[MODEL_TENSOR, str]
-
     def __init__(self, arch: MODEL_ARCH, n_blocks: int):
-        mapping = self.mapping = {}
-        tensor_names = self.tensor_names = MODEL_TENSOR_NAMES[arch]
+        self.mapping = {}
         for tensor, keys in self.mappings_cfg.items():
-            tensor_name = tensor_names.get(tensor)
-            if tensor_name is None:
+            if tensor not in MODEL_TENSORS[arch]:
                 continue
-            mapping[tensor_name] = (tensor, tensor_name)
+            tensor_name = TENSOR_NAMES[tensor]
+            self.mapping[tensor_name] = (tensor, tensor_name)
             for key in keys:
-                mapping[key] = (tensor, tensor_name)
+                self.mapping[key] = (tensor, tensor_name)
         for bid in range(n_blocks):
            for tensor, keys in self.block_mappings_cfg.items():
-                tensor_name = tensor_names.get(tensor)
-                if tensor_name is None:
+                if tensor not in MODEL_TENSORS[arch]:
                     continue
-                tensor_name = tensor_name.format(bid = bid)
-                mapping[tensor_name] = (tensor, tensor_name)
+                tensor_name = TENSOR_NAMES[tensor].format(bid = bid)
+                self.mapping[tensor_name] = (tensor, tensor_name)
                 for key in keys:
                     key = key.format(bid = bid)
-                    mapping[key] = (tensor, tensor_name)
+                    self.mapping[key] = (tensor, tensor_name)
 
     def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
         result = self.mapping.get(key)
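
Note: TensorNameMap now filters its candidate mappings through MODEL_TENSORS[arch] and always derives the GGUF-side name from TENSOR_NAMES, instead of consulting a per-architecture name dict. A hedged usage sketch (assuming the class is importable as gguf.TensorNameMap; the tensor name and the expected result are examples, not output from this patch):

    import gguf

    # Build the name map for a LLaMA-style model with 32 blocks.
    tmap = gguf.TensorNameMap(gguf.MODEL_ARCH.LLAMA, 32)

    # Translate a Hugging Face tensor name into (tensor kind, GGUF base name); None if unknown.
    hit = tmap.get_type_and_name("model.layers.0.self_attn.q_proj.weight",
                                 try_suffixes=(".weight", ".bias"))
    print(hit)  # expected: (MODEL_TENSOR.ATTN_Q, "blk.0.attn_q")
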
@@ -800,22 +881,25 @@ class SpecialVocab:
     special_token_types: tuple[str, ...] = ('bos', 'eos', 'unk', 'sep', 'pad')
     special_token_ids: dict[str, int] = {}
 
-    def __init__(self, path: Path, load_merges: bool = False, special_token_types: tuple[str, ...] | None = None):
+    def __init__(
+        self, path: str | os.PathLike[str], load_merges: bool = False,
+        special_token_types: tuple[str, ...] | None = None,
+    ):
         self.special_token_ids = {}
         self.load_merges = load_merges
         if special_token_types is not None:
             self.special_token_types = special_token_types
-        self.load(path)
+        self._load(Path(path))
 
-    def load(self, path: Path):
-        if not self.try_load_from_tokenizer_json(path):
-            self.try_load_from_config_json(path)
+    def _load(self, path: Path) -> None:
+        if not self._try_load_from_tokenizer_json(path):
+            self._try_load_from_config_json(path)
 
-    def try_load_from_tokenizer_json(self, path: Path) -> bool:
+    def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
         if not tokenizer_file.is_file():
             return False
-        with open(tokenizer_file, 'r', encoding = 'utf-8') as f:
+        with open(tokenizer_file, encoding = 'utf-8') as f:
             tokenizer = json.load(f)
         if self.load_merges:
             merges = tokenizer.get('model', {}).get('merges')
@@ -825,7 +909,7 @@ class SpecialVocab:
         added_tokens = tokenizer.get('added_tokens')
         if added_tokens is None or not tokenizer_config_file.is_file():
             return True
-        with open(tokenizer_config_file, 'r', encoding = 'utf-8') as f:
+        with open(tokenizer_config_file, encoding = 'utf-8') as f:
             tokenizer_config = json.load(f)
         for typ in self.special_token_types:
             entry = tokenizer_config.get(f'{typ}_token')
@@ -844,11 +928,11 @@ class SpecialVocab:
                     break
         return True
 
-    def try_load_from_config_json(self, path: Path) -> bool:
+    def _try_load_from_config_json(self, path: Path) -> bool:
         config_file = path / 'config.json'
         if not config_file.is_file():
             return False
-        with open(config_file, 'r', encoding = 'utf-8') as f:
+        with open(config_file, encoding = 'utf-8') as f:
             config = json.load(f)
         for typ in self.special_token_types:
             maybe_token_id = config.get(f'{typ}_token_id')
@@ -856,7 +940,7 @@ class SpecialVocab:
             self.special_token_ids[typ] = maybe_token_id
         return True
 
-    def add_to_gguf(self, gw: GGUFWriter):
+    def add_to_gguf(self, gw: GGUFWriter) -> None:
         if len(self.merges) > 0:
             print(f'gguf: Adding {len(self.merges)} merge(s).')
             gw.add_token_merges(self.merges)
@@ -868,8 +952,8 @@ class SpecialVocab:
             print(f'gguf: Setting special token type {typ} to {tokid}')
             handler(tokid)
 
-    def __repr__(self):
-        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids if self.special_token_ids else "unset"}>'
+    def __repr__(self) -> str:
+        return f'<SpecialVocab with {len(self.merges)} merges and special tokens {self.special_token_ids or "unset"}>'
 
 
 # Example usage:
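
Note: SpecialVocab now accepts any path-like value (str or os.PathLike) and converts it with Path() internally, and its loader helpers are private (_load, _try_load_from_tokenizer_json, _try_load_from_config_json). A small usage sketch (the directory name is a placeholder, and gguf_writer stands for an already-created gguf.GGUFWriter):

    import gguf

    # Directory containing tokenizer.json and/or config.json (placeholder path).
    special_vocab = gguf.SpecialVocab("models/llama-7b-hf", load_merges=True)

    special_vocab.add_to_gguf(gguf_writer)  # writes merges and special token ids, if any were found
    print(special_vocab)                    # __repr__ reports merge count and special token ids
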
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.3.3"
+version = "0.4.0"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [