remove old filever checks (+7 squashed commit)
Squashed commit: [b72627a] new format not working [e568870] old ver works [7053b77] compile errors fixed, fixing linkers [4ae8889] add new ver [ff82dfd] file format checks [25b8aa8] refactoring type names [931063b] still merging
This commit is contained in:
parent
417302b226
commit
c048bcfec4
25 changed files with 3180 additions and 506 deletions
|
@ -285,6 +285,14 @@ target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
|
|||
target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
add_library(ggml_v2 OBJECT
|
||||
otherarch/ggml_v2.c
|
||||
otherarch/ggml_v2.h)
|
||||
target_include_directories(ggml_v2 PUBLIC . ./otherarch ./otherarch/tools)
|
||||
target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
|
||||
target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
|
||||
set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
add_library(common2
|
||||
examples/common.cpp
|
||||
examples/common.h)
|
||||
|
|
32
Makefile
32
Makefile
|
@ -228,7 +228,21 @@ ggml_clblast.o: ggml.c ggml.h
|
|||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
|
||||
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||
ggml-opencl-legacy.o: ggml-opencl-legacy.c ggml-opencl-legacy.h
|
||||
|
||||
#version 2 libs
|
||||
ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
|
||||
ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(OPENBLAS_FLAGS) -c $< -o $@
|
||||
ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
|
||||
ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
|
||||
$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
|
||||
ggml_v2-opencl.o: otherarch/ggml_v2-opencl.cpp otherarch/ggml_v2-opencl.h
|
||||
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
|
||||
ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-opencl-legacy.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
#extreme old version compat
|
||||
|
@ -264,19 +278,15 @@ main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
|||
@echo '==== Run ./main -h for help. ===='
|
||||
@echo
|
||||
|
||||
koboldcpp: ggml.o ggml_v1.o expose.o common.o gpttype_adapter.o $(OBJS)
|
||||
koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(OBJS)
|
||||
$(DEFAULT_BUILD)
|
||||
|
||||
koboldcpp_openblas: ggml_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o
|
||||
$(OPENBLAS_BUILD)
|
||||
|
||||
koboldcpp_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
|
||||
koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o
|
||||
$(OPENBLAS_BUILD)
|
||||
koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
|
||||
$(NOAVX2_BUILD)
|
||||
|
||||
koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
|
||||
koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
|
||||
$(OPENBLAS_NOAVX2_BUILD)
|
||||
|
||||
koboldcpp_clblast: ggml_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml-opencl-legacy.o
|
||||
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o
|
||||
$(CLBLAST_BUILD)
|
||||
|
||||
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
|
||||
|
|
|
@ -63,7 +63,7 @@ extern "C"
|
|||
putenv((char*)deviceenv.c_str());
|
||||
executable_path = inputs.executable_path;
|
||||
|
||||
if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
|
||||
if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4 || file_format==FileFormat::GPTJ_5)
|
||||
{
|
||||
printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
|
||||
ModelLoadResult lr = gpttype_load_model(inputs, file_format);
|
||||
|
@ -141,7 +141,7 @@ extern "C"
|
|||
return true;
|
||||
}
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5|| file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
|
||||
ModelLoadResult lr = gpttype_load_model(inputs, file_format);
|
||||
|
|
127
ggml-opencl.cpp
127
ggml-opencl.cpp
|
@ -6,6 +6,7 @@
|
|||
|
||||
#define CL_TARGET_OPENCL_VERSION 110
|
||||
#include <clblast.h>
|
||||
#include <clblast_c.h>
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
@ -16,55 +17,45 @@
|
|||
#define CL_DMMV_BLOCK_SIZE 32;
|
||||
|
||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||
std::string program_source = MULTILINE_QUOTE(
|
||||
static std::string program_source = MULTILINE_QUOTE(
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
typedef int int32_t;
|
||||
typedef uint uint32_t;
|
||||
|
||||
constant uint QK4_0 = 32;
|
||||
constant uint QR4_0 = 2;
|
||||
struct block_q4_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK4_0 / 2];
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
constant uint QK4_1 = 32;
|
||||
constant uint QR4_1 = 2;
|
||||
struct block_q4_1
|
||||
{
|
||||
half d;
|
||||
half m;
|
||||
uint8_t qs[QK4_1 / 2];
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
constant uint QK5_0 = 32;
|
||||
constant uint QR5_0 = 2;
|
||||
struct __attribute__ ((packed)) block_q5_0
|
||||
{
|
||||
half d;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_0 / 2];
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
constant uint QK5_1 = 32;
|
||||
constant uint QR5_1 = 2;
|
||||
struct block_q5_1
|
||||
{
|
||||
half d;
|
||||
half m;
|
||||
uint32_t qh;
|
||||
uint8_t qs[QK5_1 / 2];
|
||||
uint8_t qs[16];
|
||||
};
|
||||
|
||||
constant uint QK8_0 = 32;
|
||||
constant uint QR8_0 = 1;
|
||||
struct block_q8_0
|
||||
{
|
||||
half d;
|
||||
uint8_t qs[QK8_0];
|
||||
uint8_t qs[32];
|
||||
};
|
||||
|
||||
|
||||
|
@ -135,13 +126,13 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
|
|||
*v0 = vi0*d;
|
||||
*v1 = vi1*d;
|
||||
}
|
||||
void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
|
||||
static void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
|
||||
*v0 = vload_half(0, &x[ib + 0]);
|
||||
*v1 = vload_half(0, &x[ib + 1]);
|
||||
}
|
||||
);
|
||||
|
||||
std::string dequant_template = MULTILINE_QUOTE(
|
||||
static std::string dequant_template = MULTILINE_QUOTE(
|
||||
__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
||||
const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
|
||||
|
||||
|
@ -165,7 +156,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
|||
}
|
||||
);
|
||||
|
||||
std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
|
||||
static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
|
||||
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
|
@ -207,29 +198,29 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
|||
}
|
||||
);
|
||||
|
||||
std::array<std::string, 5> dequant_str_keys = {
|
||||
static std::array<std::string, 5> dequant_str_keys = {
|
||||
"KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
|
||||
};
|
||||
|
||||
std::array<std::string, 30> dequant_str_values = {
|
||||
"dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
|
||||
"dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
|
||||
"dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
|
||||
"dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
|
||||
"dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
|
||||
static std::array<std::string, 30> dequant_str_values = {
|
||||
"dequantize_row_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
|
||||
"dequantize_row_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
|
||||
"dequantize_row_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
|
||||
"dequantize_row_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1",
|
||||
"dequantize_row_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0",
|
||||
"convert_row_f16", "half", "1", "1", "convert_f16"
|
||||
};
|
||||
|
||||
std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
||||
"dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
|
||||
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
|
||||
"dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
|
||||
"dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
|
||||
"dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
|
||||
static std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
||||
"dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
|
||||
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
|
||||
"dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
|
||||
"dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1",
|
||||
"dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0",
|
||||
"convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
|
||||
};
|
||||
|
||||
std::string& replace(std::string& s, const std::string& from, const std::string& to) {
|
||||
static std::string& sreplace(std::string& s, const std::string& from, const std::string& to) {
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(from, pos)) != std::string::npos) {
|
||||
s.replace(pos, from.length(), to);
|
||||
|
@ -238,15 +229,15 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
|
|||
return s;
|
||||
}
|
||||
|
||||
std::string generate_kernels() {
|
||||
static std::string generate_kernels() {
|
||||
std::stringstream src;
|
||||
src << program_source << '\n';
|
||||
for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
|
||||
std::string dequant_kernel = dequant_template;
|
||||
std::string dmmv_kernel = dequant_mul_mat_vec_template;
|
||||
for (size_t j = 0; j < dequant_str_keys.size(); j++) {
|
||||
replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
|
||||
replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
|
||||
sreplace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
|
||||
sreplace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
|
||||
}
|
||||
src << dequant_kernel << '\n';
|
||||
src << dmmv_kernel << '\n';
|
||||
|
@ -259,6 +250,7 @@ std::string generate_kernels() {
|
|||
cl_int err_ = (err); \
|
||||
if (err_ != CL_SUCCESS) { \
|
||||
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
|
||||
fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
|
||||
exit(1); \
|
||||
} \
|
||||
} while (0)
|
||||
|
@ -271,7 +263,7 @@ static cl_program program;
|
|||
static cl_kernel convert_row_f16_cl;
|
||||
static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
|
||||
static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
|
||||
static bool fp16_support;
|
||||
static bool fp16_support = false;
|
||||
|
||||
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
|
||||
cl_program p;
|
||||
|
@ -339,6 +331,8 @@ void ggml_cl_init(void) {
|
|||
}
|
||||
free(ext_buffer);
|
||||
printf("Using Platform: %s Device: %s FP16: %d\n", platform_buffer, device_buffer, fp16_support);
|
||||
fp16_support = false;
|
||||
printf("CL FP16 temporarily disabled pending further optimization.\n");
|
||||
context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
|
||||
CL_CHECK(err, "clCreateContext");
|
||||
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
|
||||
|
@ -552,17 +546,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
|
||||
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
|
@ -650,18 +645,19 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
|
||||
// compute
|
||||
cl_event ev_sgemm;
|
||||
clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor,
|
||||
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
GGML_ASSERT(false);
|
||||
printf("\nF16 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
|
@ -757,17 +753,18 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
||||
// compute
|
||||
clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor,
|
||||
clblast::Transpose::kYes, clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
|
||||
(CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
|
||||
ne01, ne11, ne10,
|
||||
alpha,
|
||||
d_X, 0, ne00,
|
||||
d_Y, 0, ne10,
|
||||
beta,
|
||||
d_D, 0, ne01,
|
||||
&queue, &ev_sgemm);
|
||||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nQF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,22 @@
|
|||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_cl_init(void);
|
||||
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
||||
void * ggml_cl_host_malloc(size_t size);
|
||||
void ggml_cl_host_free(void * ptr);
|
||||
|
||||
void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
91
ggml.c
91
ggml.c
|
@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
|
|||
return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
|
||||
}
|
||||
|
||||
#if defined(__AVX2__) || defined(__AVX512F__)
|
||||
#if __AVX2__ || __AVX512F__
|
||||
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
||||
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
||||
uint32_t x32;
|
||||
|
@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
|
|||
#endif // __AVX__ || __AVX2__ || __AVX512F__
|
||||
#endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
#if __ARM_NEON
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
|
@ -2481,7 +2481,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
|
|||
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
||||
}
|
||||
|
||||
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
||||
sumf += (GGML_FP16_TO_FP32(x[i]).d*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
||||
}
|
||||
|
||||
*s = sumf;
|
||||
|
@ -9289,7 +9289,7 @@ static void ggml_compute_forward_rms_norm_back(
|
|||
|
||||
// ggml_compute_forward_mul_mat
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
// helper function to determine if it is better to use BLAS or not
|
||||
// for large matrices, BLAS is faster
|
||||
static bool ggml_compute_forward_mul_mat_use_blas(
|
||||
|
@ -9330,7 +9330,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
#endif
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
|
@ -9394,9 +9394,16 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
|
@ -9416,21 +9423,11 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01,
|
||||
GGML_TYPE_F32);
|
||||
#else
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
//printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
||||
|
@ -9569,9 +9566,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
|
@ -9601,20 +9605,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
assert(id*sizeof(float) <= params->wsize);
|
||||
}
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01,
|
||||
GGML_TYPE_F32);
|
||||
#else
|
||||
const float * x = wdata;
|
||||
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
|
||||
|
||||
|
@ -9626,7 +9616,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -9789,9 +9778,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
}
|
||||
return;
|
||||
}
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
|
||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||
ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
|
@ -9814,9 +9810,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
|
||||
#else
|
||||
{
|
||||
size_t id = 0;
|
||||
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
||||
|
@ -9828,23 +9821,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
}
|
||||
|
||||
const float * x = wdata;
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
// zT = y * xT
|
||||
ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
0.0f, d, ne01,
|
||||
type);
|
||||
#else
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -13941,9 +13923,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
|
||||
}
|
||||
else
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1; // TODO: this actually is doing nothing
|
||||
// the threads are still spinning
|
||||
|
@ -13957,13 +13946,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
|
|||
#endif
|
||||
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
|
||||
cur = 0;
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1;
|
||||
}
|
||||
#endif
|
||||
} else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
|
||||
node->n_tasks = 1;
|
||||
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
|
||||
|
@ -15468,4 +15457,4 @@ int ggml_cpu_has_vsx(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////////////////////////////
|
1
ggml.h
1
ggml.h
|
@ -249,6 +249,7 @@ extern "C" {
|
|||
enum ggml_backend {
|
||||
GGML_BACKEND_CPU = 0,
|
||||
GGML_BACKEND_CUDA = 1,
|
||||
GGML_BACKEND_CL = 2,
|
||||
};
|
||||
|
||||
// model file types
|
||||
|
|
|
@ -12,30 +12,42 @@
|
|||
#include "otherarch.h"
|
||||
|
||||
//for easier compilation
|
||||
#include "llama_v2.cpp"
|
||||
|
||||
//concat source files into one file for compilation purposes
|
||||
#include "llama_v2.cpp"
|
||||
#include "llama.cpp"
|
||||
#include "utils.cpp"
|
||||
#include "gptj_v1.cpp"
|
||||
#include "gptj_v2.cpp"
|
||||
#include "gptj_v3.cpp"
|
||||
#include "gpt2_v1.cpp"
|
||||
#include "gpt2_v2.cpp"
|
||||
#include "gpt2_v3.cpp"
|
||||
#include "rwkv_v2.cpp"
|
||||
#include "neox_v2.cpp"
|
||||
#include "neox_v3.cpp"
|
||||
|
||||
|
||||
//return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
|
||||
static FileFormat file_format = FileFormat::BADFORMAT;
|
||||
|
||||
static gpt_vocab vocab;
|
||||
static gptj_model_v1 gptj_ctx_v1;
|
||||
static gptj_model gptj_ctx_v2;
|
||||
|
||||
static gptj_v1_model gptj_ctx_v1;
|
||||
static gptj_v2_model gptj_ctx_v2;
|
||||
static gptj_model gptj_ctx_v3;
|
||||
|
||||
static gpt2_v1_model gpt2_ctx_v1;
|
||||
static gpt2_model gpt2_ctx_v2;
|
||||
static gpt_neox_model neox_ctx;
|
||||
static gpt2_v2_model gpt2_ctx_v2;
|
||||
static gpt2_model gpt2_ctx_v3;
|
||||
|
||||
static gpt_neox_v2_model neox_ctx_v2;
|
||||
static gpt_neox_model neox_ctx_v3;
|
||||
|
||||
static rwkv_context * rwkv_ctx_v1;
|
||||
static llama_v2_context_params llama_ctx_params_v2;
|
||||
static llama_context_params llama_ctx_params;
|
||||
static llama_context * llama_ctx_v1;
|
||||
static llama_v2_context * llama_ctx_v2;
|
||||
static llama_context * llama_ctx_v3;
|
||||
|
||||
static gpt_params params;
|
||||
static int n_past = 0;
|
||||
|
@ -81,7 +93,6 @@ inline bool LogitsDuplicated(std::vector<float> & arr1, std::vector<float> & arr
|
|||
|
||||
llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng)
|
||||
{
|
||||
const int64_t t_start_sample_us = ggml_time_us();
|
||||
llama_sample_softmax(nullptr, candidates);
|
||||
std::vector<float> probs;
|
||||
probs.reserve(candidates->size);
|
||||
|
@ -220,7 +231,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
blasbatchsize = inputs.blasbatchsize;
|
||||
params.memory_f16 = inputs.f16_kv;
|
||||
params.n_ctx = inputs.max_context_length;
|
||||
neox_ctx.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = params.n_ctx;
|
||||
|
||||
neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx
|
||||
= neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = params.n_ctx;
|
||||
|
||||
printf("System Info: %s\n", llama_print_system_info());
|
||||
SetQuantsUnshuffled(false);
|
||||
|
@ -229,33 +242,31 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GGJT_2);
|
||||
|
||||
llama_ctx_params = llama_context_default_params();
|
||||
llama_ctx_params.n_ctx = inputs.max_context_length;
|
||||
llama_ctx_params_v2 = llama_v2_context_default_params();
|
||||
llama_ctx_params_v2.n_ctx = inputs.max_context_length;
|
||||
//llama_ctx_params.n_parts = -1;
|
||||
llama_ctx_params.seed = -1;
|
||||
llama_ctx_params.f16_kv = inputs.f16_kv;
|
||||
llama_ctx_params.logits_all = false;
|
||||
llama_ctx_params.use_mmap = inputs.use_mmap;
|
||||
llama_ctx_params.use_mlock = inputs.use_mlock;
|
||||
llama_ctx_params.n_gpu_layers = inputs.gpulayers;
|
||||
llama_ctx_params_v2.seed = -1;
|
||||
llama_ctx_params_v2.f16_kv = inputs.f16_kv;
|
||||
llama_ctx_params_v2.logits_all = false;
|
||||
llama_ctx_params_v2.use_mmap = inputs.use_mmap;
|
||||
llama_ctx_params_v2.use_mlock = inputs.use_mlock;
|
||||
llama_ctx_params_v2.n_gpu_layers = inputs.gpulayers;
|
||||
|
||||
llama_ctx_v1 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
|
||||
llama_ctx_v2 = llama_v2_init_from_file(modelname.c_str(), llama_ctx_params_v2);
|
||||
|
||||
if (llama_ctx_v1 == NULL)
|
||||
if (llama_ctx_v2 == NULL)
|
||||
{
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
if (file_format < FileFormat::GGJT_2)
|
||||
{
|
||||
printf("\n---\nWarning: Your model may be an OUTDATED format (ver %d). Please reconvert it for better results!\n---\n", file_format);
|
||||
}
|
||||
|
||||
|
||||
printf("\n---\nWarning: Your model may be an OUTDATED format (ver %d). Please reconvert it for better results!\n---\n", file_format);
|
||||
|
||||
if (lora_filename != "")
|
||||
{
|
||||
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
|
||||
|
||||
int err = llama_apply_lora_from_file(llama_ctx_v1,
|
||||
int err = llama_v2_apply_lora_from_file(llama_ctx_v2,
|
||||
lora_filename.c_str(),
|
||||
NULL,
|
||||
n_threads);
|
||||
|
@ -268,9 +279,47 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
|
||||
//determine mem per token
|
||||
const std::vector<int> tmp = {1, 2, 3, 4};
|
||||
llama_eval(llama_ctx_v1, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
llama_ctx_params = llama_context_default_params();
|
||||
llama_ctx_params.n_ctx = inputs.max_context_length;
|
||||
//llama_ctx_paran_parts = -1;
|
||||
llama_ctx_params.seed = -1;
|
||||
llama_ctx_params.f16_kv = inputs.f16_kv;
|
||||
llama_ctx_params.logits_all = false;
|
||||
llama_ctx_params.use_mmap = inputs.use_mmap;
|
||||
llama_ctx_params.use_mlock = inputs.use_mlock;
|
||||
llama_ctx_params.n_gpu_layers = inputs.gpulayers;
|
||||
|
||||
llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
|
||||
|
||||
if (llama_ctx_v3 == NULL)
|
||||
{
|
||||
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
if (lora_filename != "")
|
||||
{
|
||||
printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
|
||||
|
||||
int err = llama_apply_lora_from_file(llama_ctx_v3,
|
||||
lora_filename.c_str(),
|
||||
NULL,
|
||||
n_threads);
|
||||
if (err != 0)
|
||||
{
|
||||
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
//determine mem per token
|
||||
const std::vector<int> tmp = {1, 2, 3, 4};
|
||||
llama_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if (file_format == FileFormat::RWKV_1)
|
||||
{
|
||||
|
@ -329,25 +378,45 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
|
||||
else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3 || file_format==FileFormat::GPT2_4)
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
|
||||
|
||||
ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format, inputs.gpulayers);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
if(file_format==FileFormat::GPT2_4)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
|
||||
return res;
|
||||
ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v3, vocab, file_format, inputs.gpulayers);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
|
||||
return res;
|
||||
}
|
||||
// determine the required inference memory per token:
|
||||
gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
|
||||
|
||||
ModelLoadResult res = gpt2_v2_model_load(params.model, gpt2_ctx_v2, vocab, file_format, inputs.gpulayers);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
|
||||
return res;
|
||||
}
|
||||
// determine the required inference memory per token:
|
||||
gpt2_v2_eval(gpt2_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
// determine the required inference memory per token:
|
||||
gpt2_eval(gpt2_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
|
||||
{
|
||||
|
@ -375,82 +444,146 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
|
|||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
||||
else if(file_format == FileFormat::GPTJ_3 || file_format == FileFormat::GPTJ_4 || file_format == FileFormat::GPTJ_5)
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
|
||||
|
||||
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx, vocab, file_format);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
if(file_format == FileFormat::GPTJ_5)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
|
||||
return res;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gpt_neox_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
|
||||
{
|
||||
//run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
|
||||
std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
|
||||
auto orig_par_res = neox_ctx.hparams.par_res;
|
||||
neox_ctx.hparams.par_res = 0; //test with residual false
|
||||
gpt_neox_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token);
|
||||
neox_ctx.hparams.par_res = orig_par_res;
|
||||
int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
|
||||
std::string predicted = vocab.id_to_token[topid].c_str();
|
||||
auto findresult = predicted.find("8");
|
||||
if(findresult != std::string::npos && findresult<2)
|
||||
ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v3, vocab, inputs.gpulayers);
|
||||
if (loadresult == ModelLoadResult::FAIL)
|
||||
{
|
||||
printf("\n---\nOld RedPajama NeoX Detected! Switching to new format! (use_parallel_residual=False)\n");
|
||||
ggml_free(neox_ctx.ctx);
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return loadresult;
|
||||
}
|
||||
else if (loadresult == ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
|
||||
return loadresult;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
//if the logits are NAN or duplicated, it means the model is incompatible
|
||||
std::vector<float> oldlogits(logits);
|
||||
|
||||
//this is another hack because they change the library - we run the eval through the model
|
||||
//twice and compare logits. if they give the same logits for different inputs, model is broken
|
||||
gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
|
||||
{
|
||||
printf("\nBad Logits detected! Retrying GPT-J model loading...");
|
||||
ggml_free(gptj_ctx_v3.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
|
||||
|
||||
ModelLoadResult loadresult = gptj_v2_model_load(params.model, gptj_ctx_v2, vocab, inputs.gpulayers);
|
||||
if (loadresult == ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return loadresult;
|
||||
}
|
||||
else if (loadresult == ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
|
||||
return loadresult;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gptj_v2_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
//if the logits are NAN or duplicated, it means the model is incompatible
|
||||
std::vector<float> oldlogits(logits);
|
||||
|
||||
//this is another hack because they change the library - we run the eval through the model
|
||||
//twice and compare logits. if they give the same logits for different inputs, model is broken
|
||||
gptj_v2_eval(gptj_ctx_v2, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
|
||||
{
|
||||
printf("\nBad Logits detected! Retrying GPT-J model loading...");
|
||||
ggml_v2_free(gptj_ctx_v2.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5|| file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
|
||||
return res;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
else
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
|
||||
|
||||
ModelLoadResult res = gpt_neox_v2_model_load(params.model, neox_ctx_v2, vocab, file_format);
|
||||
if(res==ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return res;
|
||||
}
|
||||
else if(res==ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
|
||||
return res;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
|
||||
{
|
||||
//run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
|
||||
std::vector<int> test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
|
||||
auto orig_par_res = neox_ctx_v2.hparams.par_res;
|
||||
neox_ctx_v2.hparams.par_res = 0; //test with residual false
|
||||
gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, 0, test_embd, logits, mem_per_token);
|
||||
neox_ctx_v2.hparams.par_res = orig_par_res;
|
||||
int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
|
||||
std::string predicted = vocab.id_to_token[topid].c_str();
|
||||
auto findresult = predicted.find("8");
|
||||
if(findresult != std::string::npos && findresult<2)
|
||||
{
|
||||
printf("\n---\nOld RedPajama NeoX Detected! Switching to new format! (use_parallel_residual=False)\n");
|
||||
ggml_v2_free(neox_ctx_v2.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
//newer format has bit unshuffling
|
||||
SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
|
||||
|
||||
ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab, inputs.gpulayers);
|
||||
if (loadresult == ModelLoadResult::FAIL)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
|
||||
return loadresult;
|
||||
}
|
||||
else if (loadresult == ModelLoadResult::RETRY_LOAD)
|
||||
{
|
||||
printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
|
||||
return loadresult;
|
||||
}
|
||||
|
||||
// determine the required inference memory per token:
|
||||
gptj_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
|
||||
|
||||
//if the logits are NAN or duplicated, it means the model is incompatible
|
||||
std::vector<float> oldlogits(logits);
|
||||
|
||||
//this is another hack because they change the library - we run the eval through the model
|
||||
//twice and compare logits. if they give the same logits for different inputs, model is broken
|
||||
gptj_eval(gptj_ctx_v2, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
|
||||
|
||||
if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
|
||||
{
|
||||
printf("\nBad Logits detected! Retrying GPT-J model loading...");
|
||||
ggml_free(gptj_ctx_v2.ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
printf("\nUnknown Model, cannot load.\n");
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -501,16 +634,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
// tokenize the prompt
|
||||
std::vector<int> embd_inp;
|
||||
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
params.prompt.insert(0, 1, ' ');
|
||||
if (file_format == FileFormat::GGML)
|
||||
if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
|
||||
{
|
||||
embd_inp = ::legacy_llama_tokenize(llama_ctx_v1, params.prompt, true);
|
||||
embd_inp = ::llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
|
||||
}
|
||||
else if (file_format == FileFormat::GGML)
|
||||
{
|
||||
embd_inp = ::legacy_llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
embd_inp = ::llama_tokenize(llama_ctx_v1, params.prompt, true);
|
||||
embd_inp = ::llama_tokenize(llama_ctx_v3, params.prompt, true);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -560,7 +697,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
{
|
||||
//for non llama, limit to 256
|
||||
int bbs = blasbatchsize;
|
||||
if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2)
|
||||
if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3)
|
||||
{
|
||||
bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
|
||||
}
|
||||
|
@ -592,7 +729,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
{
|
||||
n_vocab = llama_n_vocab(llama_ctx_v1);
|
||||
n_vocab = llama_v2_n_vocab(llama_ctx_v2);
|
||||
}
|
||||
else if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
n_vocab = llama_n_vocab(llama_ctx_v3);
|
||||
}
|
||||
else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
|
||||
{
|
||||
|
@ -602,6 +743,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
{
|
||||
n_vocab = gptj_ctx_v2.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format==FileFormat::GPTJ_5)
|
||||
{
|
||||
n_vocab = gptj_ctx_v3.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format == FileFormat::GPT2_1)
|
||||
{
|
||||
n_vocab = gpt2_ctx_v1.hparams.n_vocab;
|
||||
|
@ -610,9 +755,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
{
|
||||
n_vocab = gpt2_ctx_v2.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format==FileFormat::GPT2_4)
|
||||
{
|
||||
n_vocab = gpt2_ctx_v3.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
||||
{
|
||||
n_vocab = neox_ctx.hparams.n_vocab;
|
||||
n_vocab = neox_ctx_v2.hparams.n_vocab;
|
||||
}
|
||||
else if( file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
n_vocab = neox_ctx_v3.hparams.n_vocab;
|
||||
}
|
||||
else if(file_format == FileFormat::RWKV_1)
|
||||
{
|
||||
|
@ -641,11 +794,18 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
if(debugmode)
|
||||
{
|
||||
printf("\n[Debug: Dump Input Tokens]\n");
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
for (auto id : embd_inp)
|
||||
{
|
||||
printf("'%s (%d)', ",llama_token_to_str(llama_ctx_v1, id),id);
|
||||
printf("'%s (%d)', ",llama_v2_token_to_str(llama_ctx_v2, id),id);
|
||||
}
|
||||
}
|
||||
else if (file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
for (auto id : embd_inp)
|
||||
{
|
||||
printf("'%s (%d)', ",llama_token_to_str(llama_ctx_v3, id),id);
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -680,7 +840,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
{
|
||||
evalres = (llama_eval(llama_ctx_v1, embd.data(), embdsize, n_past, params.n_threads)==0);
|
||||
evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, params.n_threads)==0);
|
||||
}
|
||||
else if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
evalres = (llama_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
|
||||
}
|
||||
else if(file_format==FileFormat::RWKV_1)
|
||||
{
|
||||
|
@ -694,20 +858,37 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
}
|
||||
else if(file_format==FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
|
||||
{
|
||||
evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
|
||||
evalres = gpt2_v2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
|
||||
}
|
||||
else if(file_format==FileFormat::GPT2_4)
|
||||
{
|
||||
evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
|
||||
{
|
||||
evalres = gpt_neox_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
evalres = gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
}
|
||||
else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
|
||||
{
|
||||
evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
}
|
||||
else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
|
||||
{
|
||||
evalres = legacy_gptj_eval(gptj_ctx_v1, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
|
||||
}
|
||||
else if(file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
|
||||
{
|
||||
evalres = gptj_v2_eval(gptj_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
}
|
||||
else if(file_format==FileFormat::GPTJ_5)
|
||||
{
|
||||
evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
}
|
||||
else
|
||||
{
|
||||
evalres = gptj_eval(gptj_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
|
||||
printf("\nCannot find eval function\n");
|
||||
}
|
||||
|
||||
if (!evalres)
|
||||
{
|
||||
fprintf(stderr, "Failed to predict\n");
|
||||
|
@ -739,9 +920,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
printf("\n");
|
||||
}
|
||||
|
||||
if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
auto logits = llama_get_logits(llama_ctx_v1);
|
||||
float * logits;
|
||||
if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
logits = llama_get_logits(llama_ctx_v3);
|
||||
}
|
||||
else
|
||||
{
|
||||
logits = llama_v2_get_logits(llama_ctx_v2);
|
||||
}
|
||||
|
||||
if (!unbanTokens)
|
||||
{
|
||||
|
@ -765,10 +954,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
if ((file_format == FileFormat::GPT2_1 ||
|
||||
file_format == FileFormat::GPT2_2 ||
|
||||
file_format == FileFormat::GPT2_3 ||
|
||||
file_format == FileFormat::GPT2_4 ||
|
||||
file_format == FileFormat::GPTJ_1 ||
|
||||
file_format == FileFormat::GPTJ_2 ||
|
||||
file_format == FileFormat::GPTJ_3 ||
|
||||
file_format == FileFormat::GPTJ_4) &&
|
||||
file_format == FileFormat::GPTJ_4 ||
|
||||
file_format == FileFormat::GPTJ_5) &&
|
||||
logits.size() > 50256)
|
||||
{
|
||||
logits[50256] = (logits[50256] < 0 ? logits[50256] : 0);
|
||||
|
@ -793,7 +984,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
|
|||
|
||||
if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
|
||||
{
|
||||
concat_output += llama_token_to_str(llama_ctx_v1, id);
|
||||
if(file_format == FileFormat::GGJT_3)
|
||||
{
|
||||
concat_output += llama_token_to_str(llama_ctx_v3, id);
|
||||
}
|
||||
else
|
||||
{
|
||||
concat_output += llama_v2_token_to_str(llama_ctx_v2, id);
|
||||
}
|
||||
|
||||
if(unbanTokens && id==llama_token_eos())
|
||||
{
|
||||
printf("\n(EOS token triggered!)");
|
||||
|
|
|
@ -200,7 +200,7 @@ maxctx = 2048
|
|||
maxlen = 128
|
||||
modelbusy = False
|
||||
defaultport = 5001
|
||||
KcppVersion = "1.23.1"
|
||||
KcppVersion = "1.24"
|
||||
|
||||
class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||
sys_version = ""
|
||||
|
|
36
llama.cpp
36
llama.cpp
|
@ -11,6 +11,8 @@
|
|||
#include "ggml.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#elif defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
#include <array>
|
||||
|
@ -91,7 +93,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
|||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_7B, 768ull * MB },
|
||||
{ MODEL_7B, 800ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
{ MODEL_65B, 1536ull * MB },
|
||||
|
@ -933,7 +935,7 @@ static void llama_model_load_internal(
if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
}
}

@ -941,7 +943,7 @@ static void llama_model_load_internal(
if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
}
}
@ -1039,7 +1041,7 @@ static void llama_model_load_internal(
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);

model.mapping = std::move(ml->mapping);
#ifdef GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS)
{
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@ -1065,6 +1067,32 @@ static void llama_model_load_internal(

fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#elif defined(GGML_USE_CLBLAST)
{
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);

size_t vram_total = 0;

for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];

ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
}
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: [opencl] offloading output layer to GPU\n", __func__);
ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
}

fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#else
(void) n_gpu_layers;
#endif
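For reference, the offload pattern added above can be condensed as follows. This is only a sketch assuming the llama_model / llama_layer types used in the diff (the helper name offload_layers_to_opencl is not part of the patch); ggml_cl_transform_tensor() uploads a tensor to the device and the byte count is accumulated for the VRAM log line.

#include <algorithm>
#include <cstddef>

// Sketch: move the weight tensors of the first n_gpu layers to the OpenCL device.
static size_t offload_layers_to_opencl(llama_model & model, int n_gpu_layers)
{
    const int n_gpu = std::min(n_gpu_layers, int(model.hparams.n_layer));
    size_t vram_total = 0;
    for (int i = 0; i < n_gpu; ++i) {
        auto & layer = model.layers[i];
        for (ggml_tensor * t : { layer.wq, layer.wk, layer.wv, layer.wo,
                                 layer.w1, layer.w2, layer.w3 }) {
            ggml_cl_transform_tensor(t);   // upload to device memory
            vram_total += ggml_nbytes(t);  // track VRAM use for logging
        }
    }
    return vram_total; // caller reports this in MB
}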
@ -112,7 +112,14 @@ void print_tok_vec(std::vector<float> &embd)
temp %= 1000;
if (qntvr != 0)
{
fileformat = FileFormat::GPTJ_4;
if (qntvr == 1)
{
fileformat = FileFormat::GPTJ_4;
}
else
{
fileformat = FileFormat::GPTJ_5;
}
}
else if (temp != 0 && temp != 1)
{
@ -131,8 +138,15 @@ void print_tok_vec(std::vector<float> &embd)
const int32_t qntvr = temp / 1000;
temp %= 1000;
if (qntvr != 0)
{
fileformat = FileFormat::GPT2_3;
{
if (qntvr == 1)
{
fileformat = FileFormat::GPT2_3;
}
else
{
fileformat = FileFormat::GPT2_4;
}
}
else if (temp != 0 && temp != 1)
{
@ -142,7 +156,7 @@ void print_tok_vec(std::vector<float> &embd)
else if(vocabsiz < 31998 || vocabsiz > 33000)
{
//anything outside the llama v1 range is assumed to be NeoX
fileformat = FileFormat::NEOX_4;
fileformat = FileFormat::NEOX_6;
uint32_t temp,temp2;
fin.read((char *)&temp, sizeof(temp)); //ctx
fin.read((char *)&temp, sizeof(temp)); //n_embd
@ -169,17 +183,21 @@ void print_tok_vec(std::vector<float> &embd)
if((temp==0||temp==1)&&(temp2==0||temp2==1))//special case: par_res and ftype are both 1 or 0
{
//its a f16/f32 model in the new format
fileformat = FileFormat::NEOX_4;
fileformat = temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6;
}
}
else
{
const int32_t qntvr = temp2 / 1000; //for future use
//then temp was par_res
if(temp==0) //use_parallel_residual is false in RedPajama
//then temp was par_res, use_parallel_residual is false in RedPajama
if(qntvr==1)
{
fileformat = FileFormat::NEOX_5;
fileformat = (temp==0?FileFormat::NEOX_5:FileFormat::NEOX_4);
}
else
{
fileformat = (temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6);
}
}
}
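The NeoX branch above folds two signals into one format choice: the quantization version and par_res (temp == 0 means use_parallel_residual is false, i.e. RedPajama). A compact sketch of that mapping, using an illustrative helper name that is not in the patch:

// Sketch of the NeoX format selection implemented above.
//   qntvr == 1 -> unshuffled formats        (NEOX_5 redpajama / NEOX_4)
//   otherwise  -> 16-bit-scalar formats     (NEOX_7 redpajama / NEOX_6)
static FileFormat pick_neox_format(uint32_t par_res, int32_t qntvr)
{
    if (qntvr == 1) {
        return par_res == 0 ? FileFormat::NEOX_5 : FileFormat::NEOX_4;
    }
    return par_res == 0 ? FileFormat::NEOX_7 : FileFormat::NEOX_6;
}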
@ -197,7 +215,7 @@ void print_tok_vec(std::vector<float> &embd)
}
else if(magic == 0x67676a74) //v3 format ggjt
{
fileformat = FileFormat::GGJT_2; //ggjt by default
fileformat = FileFormat::GGJT_3; //ggjt by default
uint32_t ver, temp, ftype;
fin.read((char *)&ver, sizeof(ver)); //file version
fin.read((char *)&temp, sizeof(temp));//vocab
@ -212,6 +230,10 @@ void print_tok_vec(std::vector<float> &embd)
{
fileformat = FileFormat::GGJT;
}
else if(ver==2)
{
fileformat = FileFormat::GGJT_2;
}
}
fin.close();
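In short, the 0x67676a74 magic selects the ggjt family and the embedded file version then narrows the format, with version 3 as the new default. A minimal sketch of that mapping (the helper name classify_ggjt is assumed, not part of the patch):

static FileFormat classify_ggjt(uint32_t ver)
{
    if (ver == 1) return FileFormat::GGJT;
    if (ver == 2) return FileFormat::GGJT_2;
    return FileFormat::GGJT_3; // version 3, and the default chosen above
}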
@ -20,15 +20,18 @@ enum FileFormat
GGHF=2, // 2=(llama ggmf)
GGJT=3, // 3=(llama ggjt)
GGJT_2=4, //newer llama format unshuffled
GGJT_3=5, //using 16bit scalar

GPTJ_1=100, //the very first super old GPTJ format
GPTJ_2=101, //pygmalion, uses old ggml lib
GPTJ_3=102, //uses new ggml lib
GPTJ_4=103, //unshuffled
GPTJ_5=104, //using 16bit scalar

GPT2_1=200,
GPT2_2=201,
GPT2_3=202, //unshuffled
GPT2_4=203, //using 16bit scalar

RWKV_1=300,

@ -37,6 +40,8 @@ enum FileFormat
NEOX_3=402, //redpajama
NEOX_4=403, //unshuffled
NEOX_5=404, //unshuffled redpajama
NEOX_6=405, //using 16bit scalar
NEOX_7=406, //using 16bit scalar redpajama
};

enum ModelLoadResult
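With GGJT_3 added to the enum, the llama-family check repeated in gpttype_generate grows by one case. A hypothetical convenience predicate (not in the patch) that mirrors that repeated condition:

// GGJT_3 routes to the current llama.cpp context; the older values route to the legacy v2 path.
static bool is_llama_family(FileFormat f)
{
    return f == FileFormat::GGML || f == FileFormat::GGHF ||
           f == FileFormat::GGJT || f == FileFormat::GGJT_2 ||
           f == FileFormat::GGJT_3;
}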
@ -12,12 +12,12 @@
|
|||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml_v2.h"
|
||||
|
||||
#define CL_DMMV_BLOCK_SIZE 32;
|
||||
|
||||
#define MULTILINE_QUOTE(...) #__VA_ARGS__
|
||||
std::string program_source = MULTILINE_QUOTE(
|
||||
static std::string program_source = MULTILINE_QUOTE(
|
||||
|
||||
typedef char int8_t;
|
||||
typedef uchar uint8_t;
|
||||
|
@ -126,13 +126,13 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
|
|||
*v0 = vi0*d;
|
||||
*v1 = vi1*d;
|
||||
}
|
||||
void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
|
||||
static void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
|
||||
*v0 = vload_half(0, &x[ib + 0]);
|
||||
*v1 = vload_half(0, &x[ib + 1]);
|
||||
}
|
||||
);
|
||||
|
||||
std::string dequant_template = MULTILINE_QUOTE(
|
||||
static std::string dequant_template = MULTILINE_QUOTE(
|
||||
__kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
||||
const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
|
||||
|
||||
|
@ -156,7 +156,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
|
|||
}
|
||||
);
|
||||
|
||||
std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
|
||||
static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
|
||||
__kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
|
||||
const int block_size = get_local_size(0);
|
||||
const int row = get_global_id(0) / block_size;
|
||||
|
@ -198,11 +198,11 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
|||
}
|
||||
);
|
||||
|
||||
std::array<std::string, 5> dequant_str_keys = {
|
||||
static std::array<std::string, 5> dequant_str_keys = {
|
||||
"KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
|
||||
};
|
||||
|
||||
std::array<std::string, 30> dequant_str_values = {
|
||||
static std::array<std::string, 30> dequant_str_values = {
|
||||
"dequantize_row_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
|
||||
"dequantize_row_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
|
||||
"dequantize_row_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
|
||||
|
@ -211,7 +211,7 @@ std::array<std::string, 30> dequant_str_values = {
|
|||
"convert_row_f16", "half", "1", "1", "convert_f16"
|
||||
};
|
||||
|
||||
std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
||||
static std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
||||
"dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
|
||||
"dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
|
||||
"dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
|
||||
|
@ -220,7 +220,7 @@ std::array<std::string, 30> dequant_mul_mat_vec_str_values = {
|
|||
"convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
|
||||
};
|
||||
|
||||
static std::string& sreplace(std::string& s, const std::string& from, const std::string& to) {
|
||||
static std::string& sreplace2(std::string& s, const std::string& from, const std::string& to) {
|
||||
size_t pos = 0;
|
||||
while ((pos = s.find(from, pos)) != std::string::npos) {
|
||||
s.replace(pos, from.length(), to);
|
||||
|
@ -236,8 +236,8 @@ static std::string generate_kernels() {
|
|||
std::string dequant_kernel = dequant_template;
|
||||
std::string dmmv_kernel = dequant_mul_mat_vec_template;
|
||||
for (size_t j = 0; j < dequant_str_keys.size(); j++) {
|
||||
sreplace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
|
||||
sreplace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
|
||||
sreplace2(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
|
||||
sreplace2(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
|
||||
}
|
||||
src << dequant_kernel << '\n';
|
||||
src << dmmv_kernel << '\n';
|
||||
|
@ -296,13 +296,13 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
return p;
}

void ggml_cl_init(void) {
void ggml_v2_cl_init(void) {
cl_int err = 0;
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
printf("\nInitializing CLBlast (First Run)...");
char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM));
int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE));
printf("\nInitializing LEGACY v2 CLBlast (First Run)...");
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
cl_uint num_platforms;
clGetPlatformIDs(0, NULL, &num_platforms);
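Note that the renamed legacy init still reads the same GGML_CLBLAST_PLATFORM / GGML_CLBLAST_DEVICE environment variables, so device selection works identically for both ggml versions. A small usage sketch, assuming a POSIX environment (setenv; on Windows an equivalent such as _putenv_s would be needed), with a helper name that is purely illustrative:

#include <cstdlib>
#include <string>

// Pick a specific OpenCL platform/device before the first CLBlast run.
static void select_clblast_device(int platform, int device)
{
    setenv("GGML_CLBLAST_PLATFORM", std::to_string(platform).c_str(), 1);
    setenv("GGML_CLBLAST_DEVICE",   std::to_string(device).c_str(),   1);
}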
@ -376,7 +376,7 @@ void ggml_cl_init(void) {
|
|||
CL_CHECK(err, "clCreateKernel");
|
||||
}
|
||||
|
||||
static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
|
||||
static void ggml_v2_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
|
||||
if (req_size <= *cur_size) {
|
||||
return;
|
||||
}
|
||||
|
@ -391,38 +391,38 @@ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags
|
|||
CL_CHECK(err, "clCreateBuffer");
|
||||
}
|
||||
|
||||
static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
|
||||
static cl_kernel* ggml_v2_get_to_fp32_cl(ggml_v2_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_V2_TYPE_Q4_0:
|
||||
return &dequantize_row_q4_0_cl;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_V2_TYPE_Q4_1:
|
||||
return &dequantize_row_q4_1_cl;
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_V2_TYPE_Q5_0:
|
||||
return &dequantize_row_q5_0_cl;
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_V2_TYPE_Q5_1:
|
||||
return &dequantize_row_q5_1_cl;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_V2_TYPE_Q8_0:
|
||||
return &dequantize_row_q8_0_cl;
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_V2_TYPE_F16:
|
||||
return &convert_row_f16_cl;
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
|
||||
static cl_kernel* ggml_v2_get_dequantize_mul_mat_vec_cl(ggml_v2_type type) {
|
||||
switch (type) {
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_V2_TYPE_Q4_0:
|
||||
return &dequantize_mul_mat_vec_q4_0_cl;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_V2_TYPE_Q4_1:
|
||||
return &dequantize_mul_mat_vec_q4_1_cl;
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_V2_TYPE_Q5_0:
|
||||
return &dequantize_mul_mat_vec_q5_0_cl;
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_V2_TYPE_Q5_1:
|
||||
return &dequantize_mul_mat_vec_q5_1_cl;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_V2_TYPE_Q8_0:
|
||||
return &dequantize_mul_mat_vec_q8_0_cl;
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_V2_TYPE_F16:
|
||||
return &convert_mul_mat_vec_f16_cl;
|
||||
default:
|
||||
return nullptr;
|
||||
|
@ -454,7 +454,7 @@ struct cl_buffer {
|
|||
static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
|
||||
static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
|
||||
|
||||
static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
|
||||
static cl_mem ggml_v2_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
|
||||
scoped_spin_lock lock(g_cl_pool_lock);
|
||||
cl_int err;
|
||||
|
||||
|
@ -473,7 +473,7 @@ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flag
|
|||
return mem;
|
||||
}
|
||||
|
||||
static void ggml_cl_pool_free(cl_mem mem, size_t size) {
|
||||
static void ggml_v2_cl_pool_free(cl_mem mem, size_t size) {
|
||||
scoped_spin_lock lock(g_cl_pool_lock);
|
||||
|
||||
for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
|
||||
|
@ -488,7 +488,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
|
|||
clReleaseMemObject(mem);
|
||||
}
|
||||
|
||||
static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
|
||||
static cl_int ggml_v2_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_v2_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
|
||||
cl_int err;
|
||||
const uint64_t ne0 = src->ne[0];
|
||||
const uint64_t ne1 = src->ne[1];
|
||||
|
@ -496,9 +496,9 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
|||
const uint64_t nb1 = src->nb[1];
|
||||
const uint64_t nb2 = src->nb[2];
|
||||
const uint64_t nb3 = src->nb[3];
|
||||
const enum ggml_type type = src->type;
|
||||
const size_t ts = ggml_type_size(type);
|
||||
const size_t bs = ggml_blck_size(type);
|
||||
const enum ggml_v2_type type = src->type;
|
||||
const size_t ts = ggml_v2_type_size(type);
|
||||
const size_t bs = ggml_v2_blck_size(type);
|
||||
|
||||
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
|
||||
if (nb0 == ts && nb1 == ts*ne0/bs) {
|
||||
|
@ -525,7 +525,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
|
|||
return err;
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
static void ggml_v2_cl_mul_mat_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) {
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
@ -544,18 +544,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
const int d_ne = ne11 * ne01;
|
||||
|
||||
size_t x_size, y_size, d_size;
|
||||
cl_mem d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
cl_mem d_X = ggml_v2_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
|
||||
cl_int err;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy data to device
|
||||
err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL);
|
||||
err |= ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL);
|
||||
CL_CHECK(err, "ggml_cl_h2d_tensor_2d");
|
||||
err = ggml_v2_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL);
|
||||
err |= ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL);
|
||||
CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
||||
|
@ -574,7 +574,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host
|
||||
|
@ -584,13 +584,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
}
|
||||
}
|
||||
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
ggml_v2_cl_pool_free(d_X, x_size);
|
||||
ggml_v2_cl_pool_free(d_Y, y_size);
|
||||
ggml_v2_cl_pool_free(d_D, d_size);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
|
||||
GGML_ASSERT(fp16_support);
|
||||
static void ggml_v2_cl_mul_mat_f16(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst, void * wdata, size_t /* wsize */) {
|
||||
GGML_V2_ASSERT(fp16_support);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
|
@ -608,16 +608,16 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
|
||||
const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f);
|
||||
const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f);
|
||||
const ggml_v2_fp16_t alpha = ggml_v2_fp32_to_fp16(1.0f);
|
||||
const ggml_v2_fp16_t beta = ggml_v2_fp32_to_fp16(0.0f);
|
||||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
|
||||
size_t x_size, y_size, d_size;
|
||||
cl_mem d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
cl_mem d_X = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
|
||||
cl_int err;
|
||||
|
||||
|
@ -627,20 +627,20 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
// copy src0 to device
|
||||
err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL);
|
||||
CL_CHECK(err, "ggml_cl_h2d_tensor_2d");
|
||||
err = ggml_v2_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL);
|
||||
CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
// convert src1 to fp16
|
||||
// TODO: use multiple threads
|
||||
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
|
||||
ggml_v2_fp16_t * const tmp = (ggml_v2_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
|
||||
char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
|
||||
if (src1_cont_rows) {
|
||||
if (src1_cont_cols) {
|
||||
ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
ggml_v2_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
|
||||
}
|
||||
else {
|
||||
for (int64_t i01 = 0; i01 < ne11; i01++) {
|
||||
ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
|
||||
ggml_v2_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -648,14 +648,14 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
for (int64_t i01 = 0; i01 < ne11; i01++) {
|
||||
for (int64_t i00 = 0; i00 < ne10; i00++) {
|
||||
// very slow due to no inlining
|
||||
tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
|
||||
tmp[i01*ne10 + i00] = ggml_v2_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// copy src1 to device
|
||||
err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL);
|
||||
CL_CHECK(err, "ggml_cl_h2d_tensor_2d");
|
||||
err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_v2_fp16_t) * y_ne, tmp, 0, NULL, NULL);
|
||||
CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
||||
|
@ -673,24 +673,24 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
|||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nF16 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
|
||||
// copy dst to host, then convert to float
|
||||
err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL);
|
||||
err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_v2_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL);
|
||||
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
|
||||
ggml_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
ggml_v2_fp16_to_fp32_row(tmp, d, d_ne);
|
||||
}
|
||||
}
|
||||
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
ggml_v2_cl_pool_free(d_X, x_size);
|
||||
ggml_v2_cl_pool_free(d_Y, y_size);
|
||||
ggml_v2_cl_pool_free(d_D, d_size);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
static void ggml_v2_cl_mul_mat_q_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) {
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
@ -701,7 +701,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
const ggml_type type = src0->type;
|
||||
const ggml_v2_type type = src0->type;
|
||||
const bool mul_mat_vec = ne11 == 1;
|
||||
|
||||
const float alpha = 1.0f;
|
||||
|
@ -709,39 +709,39 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
const int x_ne = ne01 * ne00;
|
||||
const int y_ne = ne11 * ne10;
|
||||
const int d_ne = ne11 * ne01;
|
||||
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
|
||||
const size_t q_sz = ggml_v2_type_size(type) * x_ne / ggml_v2_blck_size(type);
|
||||
|
||||
size_t x_size, y_size, d_size, q_size;
|
||||
cl_mem d_X;
|
||||
if (!mul_mat_vec) {
|
||||
d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
|
||||
d_X = ggml_v2_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE);
|
||||
}
|
||||
cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY);
|
||||
cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY);
|
||||
cl_mem d_Q;
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
||||
if (src0->backend == GGML_V2_BACKEND_CPU) {
|
||||
d_Q = ggml_v2_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
||||
}
|
||||
|
||||
cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type);
|
||||
cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type);
|
||||
GGML_ASSERT(to_fp32_cl != nullptr);
|
||||
cl_kernel* to_fp32_cl = ggml_v2_get_to_fp32_cl(type);
|
||||
cl_kernel* dmmv = ggml_v2_get_dequantize_mul_mat_vec_cl(type);
|
||||
GGML_V2_ASSERT(to_fp32_cl != nullptr);
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
cl_event ev_sgemm;
|
||||
|
||||
// copy src0 to device if necessary
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_cl_h2d_tensor_2d");
|
||||
} else if (src0->backend == GGML_BACKEND_CL) {
|
||||
if (src0->backend == GGML_V2_BACKEND_CPU) {
|
||||
CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d");
|
||||
} else if (src0->backend == GGML_V2_BACKEND_CL) {
|
||||
d_Q = *(cl_mem*) src0->data;
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d");
|
||||
CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
// compute
|
||||
const size_t global = ne01 * CL_DMMV_BLOCK_SIZE;
|
||||
|
@ -763,7 +763,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL), "clEnqueueNDRangeKernel");
|
||||
|
||||
// copy src1 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d");
|
||||
CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d");
|
||||
|
||||
// wait for conversion
|
||||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
@ -781,7 +781,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
|
||||
if (status != clblast::StatusCode::kSuccess) {
|
||||
printf("\nQF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -793,146 +793,146 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
|
|||
}
|
||||
|
||||
if (!mul_mat_vec) {
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
ggml_v2_cl_pool_free(d_X, x_size);
|
||||
}
|
||||
ggml_cl_pool_free(d_Y, y_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
if (src0->backend == GGML_BACKEND_CPU) {
|
||||
ggml_cl_pool_free(d_Q, q_size);
|
||||
ggml_v2_cl_pool_free(d_Y, y_size);
|
||||
ggml_v2_cl_pool_free(d_D, d_size);
|
||||
if (src0->backend == GGML_V2_BACKEND_CPU) {
|
||||
ggml_v2_cl_pool_free(d_Q, q_size);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
bool ggml_v2_cl_can_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst) {
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
const int64_t ne1 = dst->ne[1];
|
||||
|
||||
// TODO: find the optimal values for these
|
||||
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
||||
src1->type == GGML_TYPE_F32 &&
|
||||
dst->type == GGML_TYPE_F32 &&
|
||||
((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
|
||||
if ((src0->type == GGML_V2_TYPE_F32 || src0->type == GGML_V2_TYPE_F16 || ggml_v2_is_quantized(src0->type)) &&
|
||||
src1->type == GGML_V2_TYPE_F32 &&
|
||||
dst->type == GGML_V2_TYPE_F32 &&
|
||||
((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_V2_BACKEND_CL)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
bool ggml_v2_cl_mul_mat_use_f16(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * /* dst */) {
// If device doesn't support FP16
if (!fp16_support) {
return false;
}

size_t src0_sz = ggml_nbytes(src0);
size_t src1_sz = ggml_nbytes(src1);
size_t src0_sz = ggml_v2_nbytes(src0);
size_t src1_sz = ggml_v2_nbytes(src1);

// mul_mat_q: src0 is converted to fp32 on device
size_t mul_mat_q_transfer = src0_sz + src1_sz;

// mul_mat_f16: src1 is converted to fp16 on cpu
size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1);
size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_v2_fp16_t) * ggml_v2_nelements(src1);

// choose the smaller one to transfer to the device
// TODO: this is not always the best choice due to the overhead of converting to fp16
return mul_mat_f16_transfer < mul_mat_q_transfer;
}
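The heuristic above compares host-to-device transfer sizes only. A worked example with assumed shapes (not taken from the patch) shows why the f16 path usually wins for quantized weights:

#include <cstddef>

// Illustrative numbers: a 4096x4096 q4_0 weight (~9 MB at 18 bytes per
// 32-element block in the 16-bit-scalar format) against a 4096x512 f32
// activation (8 MB as f32, 4 MB once converted to f16 on the CPU).
static bool prefer_f16_path_example()
{
    const size_t src0_sz   = (4096ull * 4096 / 32) * 18;      // quantized weight bytes
    const size_t src1_f32  = 4096ull * 512 * sizeof(float);   // activation as f32
    const size_t src1_f16  = 4096ull * 512 * 2;               // activation as f16
    const size_t mul_mat_q_transfer   = src0_sz + src1_f32;   // ~17 MB
    const size_t mul_mat_f16_transfer = src0_sz + src1_f16;   // ~13 MB
    return mul_mat_f16_transfer < mul_mat_q_transfer;         // true -> take the f16 path
}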
void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
|
||||
GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst));
|
||||
void ggml_v2_cl_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst, void * wdata, size_t wsize) {
|
||||
GGML_V2_ASSERT(ggml_v2_cl_can_mul_mat(src0, src1, dst));
|
||||
|
||||
if (src0->type == GGML_TYPE_F32) {
|
||||
ggml_cl_mul_mat_f32(src0, src1, dst);
|
||||
if (src0->type == GGML_V2_TYPE_F32) {
|
||||
ggml_v2_cl_mul_mat_f32(src0, src1, dst);
|
||||
}
|
||||
else if (src0->type == GGML_TYPE_F16) {
|
||||
if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize);
|
||||
else if (src0->type == GGML_V2_TYPE_F16) {
|
||||
if (ggml_v2_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
ggml_v2_cl_mul_mat_f16(src0, src1, dst, wdata, wsize);
|
||||
}
|
||||
else {
|
||||
ggml_cl_mul_mat_q_f32(src0, src1, dst);
|
||||
ggml_v2_cl_mul_mat_q_f32(src0, src1, dst);
|
||||
}
|
||||
}
|
||||
else if (ggml_is_quantized(src0->type)) {
|
||||
ggml_cl_mul_mat_q_f32(src0, src1, dst);
|
||||
else if (ggml_v2_is_quantized(src0->type)) {
|
||||
ggml_v2_cl_mul_mat_q_f32(src0, src1, dst);
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(false);
|
||||
GGML_V2_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
return ggml_nelements(src1) * sizeof(ggml_fp16_t);
|
||||
size_t ggml_v2_cl_mul_mat_get_wsize(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst) {
|
||||
if (ggml_v2_cl_mul_mat_use_f16(src0, src1, dst)) {
|
||||
return ggml_v2_nelements(src1) * sizeof(ggml_v2_fp16_t);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void ggml_cl_transform_tensor(ggml_tensor * tensor) {
|
||||
void ggml_v2_cl_transform_tensor(ggml_v2_tensor * tensor) {
|
||||
const int64_t ne0 = tensor->ne[0];
|
||||
const int64_t ne1 = tensor->ne[1];
|
||||
const int64_t ne2 = tensor->ne[2];
|
||||
const int64_t ne3 = tensor->ne[3];
|
||||
|
||||
const ggml_type type = tensor->type;
|
||||
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
|
||||
const ggml_v2_type type = tensor->type;
|
||||
const size_t q_sz = ggml_v2_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_v2_blck_size(type);
|
||||
|
||||
size_t q_size;
|
||||
cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem));
|
||||
*dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
||||
*dst = ggml_v2_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY);
|
||||
|
||||
// copy tensor to device
|
||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
||||
int i = i3*ne2 + i2;
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL), "ggml_cl_h2d_tensor_2d");
|
||||
CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL), "ggml_v2_cl_h2d_tensor_2d");
|
||||
}
|
||||
}
|
||||
|
||||
CL_CHECK(clFinish(queue), "clFinish");
|
||||
|
||||
tensor->data = dst;
|
||||
tensor->backend = GGML_BACKEND_CL;
|
||||
tensor->backend = GGML_V2_BACKEND_CL;
|
||||
}
|
||||
|
||||
void ggml_cl_sgemm_wrapper(
|
||||
const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
|
||||
void ggml_v2_cl_sgemm_wrapper(
|
||||
const enum ggml_v2_blas_order order, const enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b,
|
||||
const int m, const int n, const int k,
|
||||
const float alpha, const void *host_a, const int lda,
|
||||
const float *host_b, const int ldb, const float beta,
|
||||
float *host_c, const int ldc, const int btype) {
|
||||
cl_int err = 0;
|
||||
|
||||
cl_kernel * kernel = ggml_get_to_fp32_cl((ggml_type)btype);
|
||||
cl_kernel * kernel = ggml_v2_get_to_fp32_cl((ggml_v2_type)btype);
|
||||
size_t global = n * k, local, size_qb;
|
||||
bool dequant;
|
||||
|
||||
switch (btype) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_V2_TYPE_F32:
|
||||
dequant = false;
|
||||
break;
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_V2_TYPE_Q4_0:
|
||||
dequant = true;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q4_1:
|
||||
case GGML_V2_TYPE_Q4_1:
|
||||
dequant = true;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(float) * 2 + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_0:
|
||||
case GGML_V2_TYPE_Q5_0:
|
||||
dequant = true;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
|
||||
size_qb = global * (sizeof(ggml_v2_fp16_t) + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q5_1:
|
||||
case GGML_V2_TYPE_Q5_1:
|
||||
dequant = true;
|
||||
local = 16;
|
||||
size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
|
||||
size_qb = global * (sizeof(ggml_v2_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_V2_TYPE_Q8_0:
|
||||
dequant = true;
|
||||
local = 32;
|
||||
size_qb = global * (sizeof(float) + local) / 32;
|
||||
|
@ -947,12 +947,12 @@ void ggml_cl_sgemm_wrapper(
|
|||
const size_t size_c = m * n * sizeof(float);
|
||||
|
||||
// Prepare buffers
|
||||
ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
|
||||
ggml_v2_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
|
||||
if (dequant) {
|
||||
ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
|
||||
ggml_v2_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
|
||||
}
|
||||
ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
|
||||
ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
|
||||
ggml_v2_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
|
||||
ggml_v2_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
|
||||
|
||||
cl_event ev_a, ev_qb, ev_b;
|
||||
|
||||
|
|
|
@ -1463,7 +1463,7 @@ static void ggml_v2_vec_dot_q5_0_q8_0(const int n, float * restrict s, const voi
|
|||
static void ggml_v2_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
static void ggml_v2_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
||||
|
||||
static const quantize_fns_t quantize_fns[GGML_V2_TYPE_COUNT] = {
|
||||
static const quantize_fns_t2 quantize_fns[GGML_V2_TYPE_COUNT] = {
|
||||
[GGML_V2_TYPE_Q4_0] = {
|
||||
.dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0,
|
||||
.quantize_row_q = quantize_row_q4_0,
|
||||
|
@ -1515,14 +1515,14 @@ static const quantize_fns_t quantize_fns[GGML_V2_TYPE_COUNT] = {
|
|||
};
|
||||
|
||||
// For internal test use
quantize_fns_t ggml_v2_internal_get_quantize_fn(size_t i) {
quantize_fns_t2 ggml_v2_internal_get_quantize_fn(size_t i) {
GGML_V2_ASSERT(i < GGML_V2_TYPE_COUNT);
return quantize_fns[i];
}

bool quants_unshuffled = false; //new GGJT_2 is unshuffled, all old ones are shuffled
static const quantize_fns_t quantize_fns_v2[GGML_V2_TYPE_COUNT]; //forward decl
static inline quantize_fns_t get_quantize_fn(size_t i)
static const quantize_fns_t2 quantize_fns_v2[GGML_V2_TYPE_COUNT]; //forward decl
static inline quantize_fns_t2 get_quantize_fn(size_t i)
{
return(quants_unshuffled?quantize_fns[i]:quantize_fns_v2[i]);
}
@ -17186,7 +17186,7 @@ inline bool GetQuantsUnshuffled()
|
|||
}
|
||||
|
||||
//TODO: integrate backwards compat
|
||||
static const quantize_fns_t quantize_fns_v2[GGML_V2_TYPE_COUNT] = {
|
||||
static const quantize_fns_t2 quantize_fns_v2[GGML_V2_TYPE_COUNT] = {
|
||||
[GGML_V2_TYPE_Q4_0] = {
|
||||
.dequantize_row_q = dequantize_row_q4_0_v2,
|
||||
.quantize_row_q = quantize_row_q4_0_v2,
|
||||
|
|
|
@ -1134,9 +1134,9 @@ extern "C" {
|
|||
quantize_row_q_t quantize_row_q_dot;
|
||||
vec_dot_q_t vec_dot_q;
|
||||
enum ggml_v2_type vec_dot_type;
|
||||
} quantize_fns_t;
|
||||
} quantize_fns_t2;
|
||||
|
||||
quantize_fns_t ggml_v2_internal_get_quantize_fn(size_t i);
|
||||
quantize_fns_t2 ggml_v2_internal_get_quantize_fn(size_t i);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -15,12 +15,9 @@
|
|||
|
||||
#include "model_adapter.h"
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
#include "ggml_v2-opencl.h"
|
||||
#endif
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
|
||||
ModelLoadResult gpt2_v2_model_load(const std::string & fname, gpt2_v2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
|
||||
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
|
@ -379,8 +376,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
|
|||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - embd_w: the predicted logits for the next token
|
||||
//
|
||||
bool gpt2_eval(
|
||||
const gpt2_model & model,
|
||||
bool gpt2_v2_eval(
|
||||
const gpt2_v2_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
|
|
696 otherarch/gpt2_v3.cpp Normal file
@ -0,0 +1,696 @@
|
|||
#include "ggml.h"
|
||||
#include "otherarch.h"
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "model_adapter.h"
|
||||
|
||||
#if defined(GGML_USE_CLBLAST)
|
||||
#include "ggml-opencl.h"
|
||||
#endif
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
|
||||
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr = %d\n", __func__, qntvr);
|
||||
|
||||
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
||||
}
|
||||
|
||||
// load vocab
|
||||
{
|
||||
int32_t n_vocab = 0;
|
||||
fin.read((char *) &n_vocab, sizeof(n_vocab));
|
||||
|
||||
if (n_vocab != model.hparams.n_vocab) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
||||
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
std::string word;
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
fin.read((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
}
|
||||
}
|
||||
|
||||
auto memory_type = GGML_TYPE_F16;
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||
// in order to save memory and also to speed up the computation
|
||||
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
||||
if (wtype == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||
__func__, fname.c_str(), model.hparams.ftype);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
||||
|
||||
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
|
||||
ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
|
||||
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
|
||||
|
||||
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
|
||||
ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
||||
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||
|
||||
ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // memory_k
|
||||
ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // memory_v
|
||||
|
||||
ctx_size += (6 + 12*n_layer)*512; // object overhead
|
||||
|
||||
printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
|
||||
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
|
||||
// map by name
|
||||
model.tensors["model/ln_f/g"] = model.ln_f_g;
|
||||
model.tensors["model/ln_f/b"] = model.ln_f_b;
|
||||
|
||||
model.tensors["model/wte"] = model.wte;
|
||||
model.tensors["model/wpe"] = model.wpe;
|
||||
model.tensors["model/lm_head"] = model.lm_head;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
|
||||
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
||||
|
||||
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
||||
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||
|
||||
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
// map by name
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
|
||||
model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
|
||||
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
|
||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
|
||||
}
|
||||
}
|
||||
|
||||
// key + value memory
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
|
||||
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements*1.5);
|
||||
model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements*1.5);
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
}
|
||||
|
||||
// load weights
|
||||
{
|
||||
size_t total_size = 0;
|
||||
|
||||
bool has_lm_head = false;
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ttype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
if (0) {
|
||||
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||
|
||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
|
||||
// GPT-2 models share the WTE tensor as the LM head
|
||||
if (name == "model/wte" && has_lm_head == false) {
|
||||
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
if (name == "model/lm_head") {
|
||||
has_lm_head = true;
|
||||
}
|
||||
|
||||
total_size += ggml_nbytes(tensor);
|
||||
}
|
||||
|
||||
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
|
||||
// //gpu offload for gpt2
|
||||
// #if defined(GGML_USE_CLBLAST)
|
||||
// if(gpulayers>0)
|
||||
// {
|
||||
// const auto & hparams = model.hparams;
|
||||
// const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
|
||||
// if(GetQuantsUnshuffled())
|
||||
// {
|
||||
// SetGPULayers(n_gpu);
|
||||
|
||||
// fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
|
||||
// size_t vram_total = 0;
|
||||
|
||||
// for (int i = 0; i < n_gpu; ++i) {
|
||||
// const auto & layer = model.layers[i];
|
||||
|
||||
// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
|
||||
// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
|
||||
// ggml_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_nbytes(layer.ln_2_g);
|
||||
// ggml_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_nbytes(layer.ln_2_b);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_nbytes(layer.c_attn_attn_b);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_nbytes(layer.c_attn_proj_b);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
|
||||
// }
|
||||
|
||||
// fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// if(n_gpu>0)
|
||||
// {
|
||||
// printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// #endif
|
||||
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
// - n_threads: number of threads to use
|
||||
// - n_past: the context size so far
|
||||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - embd_w: the predicted logits for the next token
|
||||
//
|
||||
bool gpt2_eval(
|
||||
const gpt2_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token,
|
||||
FileFormat file_format) {
|
||||
const int N = embd_inp.size();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_head = hparams.n_head;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
static size_t buf_size = 256u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
||||
const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
|
||||
// reallocate
|
||||
if (buf_size_new > buf_size)
|
||||
{
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = buf_size;
|
||||
params.mem_buffer = buf;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph gf = {};
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||
|
||||
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
for (int i = 0; i < N; ++i) {
|
||||
((int32_t *) position->data)[i] = n_past + i;
|
||||
}
|
||||
|
||||
// wte + wpe
|
||||
struct ggml_tensor * inpL =
|
||||
ggml_add(ctx0,
|
||||
ggml_get_rows(ctx0, model.wte, embd),
|
||||
ggml_get_rows(ctx0, model.wpe, position));
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
// norm
|
||||
{
|
||||
// [ 768, N]
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
|
||||
// cur = ln_1_g*cur + ln_1_b
|
||||
// [ 768, N]
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
}
|
||||
|
||||
// attn
|
||||
// [2304, 768] - model.layers[il].c_attn_attn_w
|
||||
// [2304, 1] - model.layers[il].c_attn_attn_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [2304, N] - cur (out)
|
||||
//
|
||||
// cur = attn_w*cur + attn_b
|
||||
// [2304, N]
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_attn_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
|
||||
struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
|
||||
|
||||
// store key and value to memory
|
||||
if (N >= 1) {
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
// [64, N, 12]
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
ggml_cpy(ctx0,
|
||||
Qcur,
|
||||
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
// [64, n_past + N, 12]
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// GG: flash attention
|
||||
//struct ggml_tensor * V =
|
||||
// ggml_cpy(ctx0,
|
||||
// ggml_permute(ctx0,
|
||||
// ggml_reshape_3d(ctx0,
|
||||
// ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
// n_embd/n_head, n_head, n_past + N),
|
||||
// 1, 2, 0, 3),
|
||||
// ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
|
||||
|
||||
//struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
|
||||
|
||||
// K * Q
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale_inplace(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||
);
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
// [n_past + N, N, 12]
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
// [n_past + N, 64, 12]
|
||||
struct ggml_tensor * V_trans =
|
||||
ggml_cpy(ctx0,
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
1, 2, 0, 3),
|
||||
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
// [64, N, 12]
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
// [64, 12, N]
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||
// [768, N]
|
||||
cur = ggml_cpy(ctx0,
|
||||
KQV_merged,
|
||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||
}
|
||||
|
||||
// projection
|
||||
// [ 768, 768] - model.layers[il].c_attn_proj_w
|
||||
// [ 768, 1] - model.layers[il].c_attn_proj_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [ 768, N] - cur (out)
|
||||
//
|
||||
// cur = proj_w*cur + proj_b
|
||||
// [768, N]
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// add the input
|
||||
cur = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
// feed-forward network
|
||||
{
|
||||
// norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpFF);
|
||||
|
||||
// cur = ln_2_g*cur + ln_2_b
|
||||
// [ 768, N]
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
|
||||
}
|
||||
|
||||
// fully connected
|
||||
// [3072, 768] - model.layers[il].c_mlp_fc_w
|
||||
// [3072, 1] - model.layers[il].c_mlp_fc_b
|
||||
// [ 768, N] - cur (in)
|
||||
// [3072, N] - cur (out)
|
||||
//
|
||||
// cur = fc_w*cur + fc_b
|
||||
// [3072, N]
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_fc_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
|
||||
cur);
|
||||
|
||||
// GELU activation
|
||||
// [3072, N]
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// projection
|
||||
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
|
||||
// [ 768, 1] - model.layers[il].c_mlp_proj_b
|
||||
// [3072, N] - cur (in)
|
||||
// [ 768, N] - cur (out)
|
||||
//
|
||||
// cur = proj_w*cur + proj_b
|
||||
// [768, N]
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
}
|
||||
|
||||
// norm
|
||||
{
|
||||
// [ 768, N]
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
// [ 768, N]
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.ln_f_g, inpL),
|
||||
inpL),
|
||||
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||
}
|
||||
|
||||
// inpL = WTE * inpL
|
||||
// [ 768, 50257] - model.lm_head
|
||||
// [ 768, N] - inpL
|
||||
inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
|
||||
|
||||
// logits -> probs
|
||||
//inpL = ggml_soft_max_inplace(ctx0, inpL);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
|
||||
// return result just for the last token
|
||||
embd_w.resize(n_vocab);
|
||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return true;
|
||||
}
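A minimal usage sketch for gpt2_eval, assuming a model, prompt tokens and batching parameters prepared elsewhere (all names below are hypothetical): one warm-up call populates mem_per_token so the scratch buffer can be sized, then the prompt is fed in batches.

std::vector<float> logits;
size_t mem_per_token = 0;

// warm-up pass on a few dummy tokens to measure per-token memory use
gpt2_eval(model, n_threads, 0, {0, 1, 2, 3}, logits, mem_per_token, file_format);

int n_past = 0;
for (size_t i = 0; i < prompt.size(); i += n_batch) {
    std::vector<gpt_vocab::id> batch(prompt.begin() + i,
                                     prompt.begin() + std::min(prompt.size(), i + n_batch));
    if (!gpt2_eval(model, n_threads, n_past, batch, logits, mem_per_token, file_format)) {
        fprintf(stderr, "gpt2_eval failed\n");
        break;
    }
    n_past += (int) batch.size();  // logits now hold n_vocab values for the last token in the batch
}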
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1 & model, gpt_vocab & vocab, FileFormat file_format) {
|
||||
ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_v1_model & model, gpt_vocab & vocab, FileFormat file_format) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
bool super_old_format = (file_format==FileFormat::GPTJ_1);
|
||||
|
@ -363,7 +363,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1
|
|||
// The GPT-J model requires about 16MB of memory per input token.
|
||||
//
|
||||
bool legacy_gptj_eval(
|
||||
const gptj_model_v1 & model,
|
||||
const gptj_v1_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
|
||||
ModelLoadResult gptj_v2_model_load(const std::string & fname, gptj_v2_model & model, gpt_vocab & vocab, int gpulayers) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
|
@ -382,8 +382,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
|
|||
//
|
||||
// The GPT-J model requires about 16MB of memory per input token.
|
||||
//
|
||||
bool gptj_eval(
|
||||
const gptj_model & model,
|
||||
bool gptj_v2_eval(
|
||||
const gptj_v2_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
|
|
613 otherarch/gptj_v3.cpp Normal file
|
@ -0,0 +1,613 @@
|
|||
#include "ggml.h"
|
||||
#include "otherarch.h"
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
#include "model_adapter.h"
|
||||
|
||||
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model.hparams;
|
||||
|
||||
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
||||
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
||||
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr = %d\n", __func__, qntvr);
|
||||
|
||||
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
||||
}
|
||||
|
||||
// load vocab
|
||||
{
|
||||
int32_t n_vocab = 0;
|
||||
fin.read((char *) &n_vocab, sizeof(n_vocab));
|
||||
|
||||
if (n_vocab != model.hparams.n_vocab) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
||||
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
std::string word;
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
fin.read((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
}
|
||||
}
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||
// in order to save memory and also to speed up the computation
|
||||
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
||||
if (wtype == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||
__func__, fname.c_str(), model.hparams.ftype);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
auto memory_type = GGML_TYPE_F16;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
|
||||
ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
||||
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
|
||||
|
||||
ctx_size += (5 + 10*n_layer)*512; // object overhead
|
||||
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
|
||||
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
|
||||
|
||||
// map by name
|
||||
model.tensors["transformer.wte.weight"] = model.wte;
|
||||
|
||||
model.tensors["transformer.ln_f.weight"] = model.ln_f_g;
|
||||
model.tensors["transformer.ln_f.bias"] = model.ln_f_b;
|
||||
|
||||
model.tensors["lm_head.weight"] = model.lmh_g;
|
||||
model.tensors["lm_head.bias"] = model.lmh_b;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
|
||||
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
|
||||
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
||||
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||
|
||||
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
// map by name
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b;
|
||||
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"] = layer.c_attn_q_proj_w;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"] = layer.c_attn_k_proj_w;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"] = layer.c_attn_v_proj_w;
|
||||
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w;
|
||||
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b;
|
||||
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w;
|
||||
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b;
|
||||
}
|
||||
}
|
||||
|
||||
// key + value memory
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
|
||||
const int n_mem = n_layer*n_ctx;
|
||||
const int n_elements = n_embd*n_mem;
|
||||
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
|
||||
model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
}
|
||||
|
||||
// load weights
|
||||
{
|
||||
int n_tensors = 0;
|
||||
size_t total_size = 0;
|
||||
|
||||
printf("%s: ", __func__);
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ttype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
|
||||
//test for transposition and retry older loader
|
||||
if(tensor->ne[0]==ne[1] && tensor->ne[1]==ne[0] && should_transpose_layer(name))
|
||||
{
|
||||
printf("\nFound a transposed tensor. This could be an older or newer model. Retrying load...");
|
||||
ggml_free(ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
else
|
||||
{
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// for debugging
|
||||
if (0) {
|
||||
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||
|
||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
|
||||
//printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||
total_size += ggml_nbytes(tensor);
|
||||
if (++n_tensors % 8 == 0) {
|
||||
printf(".");
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
printf(" done\n");
|
||||
|
||||
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
// //gpu offload for gptj
|
||||
// #if defined(GGML_USE_CLBLAST)
|
||||
// if(gpulayers>0)
|
||||
// {
|
||||
// const auto & hparams = model.hparams;
|
||||
// const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
|
||||
// if(GetQuantsUnshuffled())
|
||||
// {
|
||||
// SetGPULayers(n_gpu);
|
||||
|
||||
// fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
|
||||
|
||||
// size_t vram_total = 0;
|
||||
|
||||
// for (int i = 0; i < n_gpu; ++i) {
|
||||
// const auto & layer = model.layers[i];
|
||||
|
||||
// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
|
||||
// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
|
||||
// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
|
||||
// }
|
||||
|
||||
// fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
|
||||
// }
|
||||
// else
|
||||
// {
|
||||
// if(n_gpu>0)
|
||||
// {
|
||||
// printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
// #endif
|
||||
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
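A sketch of how a caller is expected to react to RETRY_LOAD from the loader above; the real dispatch lives in the adapter code, so the variable names here are only illustrative:

gpt_vocab vocab;
gptj_model    model_v3;  // current-format weights
gptj_v2_model model_v2;  // previous-format weights (transposed tensor layout)

ModelLoadResult res = gptj_model_load(fname, model_v3, vocab, gpulayers);
if (res == ModelLoadResult::RETRY_LOAD) {
    // a transposed tensor was detected, so fall back to the older loader
    res = gptj_v2_model_load(fname, model_v2, vocab, gpulayers);
}
if (res != ModelLoadResult::SUCCESS) {
    fprintf(stderr, "failed to load model from %s\n", fname.c_str());
}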
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
// - n_threads: number of threads to use
|
||||
// - n_past: the context size so far
|
||||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - embd_w: the predicted logits for the next token
|
||||
//
|
||||
// The GPT-J model requires about 16MB of memory per input token.
|
||||
//
|
||||
bool gptj_eval(
|
||||
const gptj_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token) {
|
||||
const int N = embd_inp.size();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_head = hparams.n_head;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
const int n_rot = hparams.n_rot;
|
||||
|
||||
static size_t buf_size = 256u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
||||
const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
|
||||
// reallocate
|
||||
if (buf_size_new > buf_size)
|
||||
{
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr)
|
||||
{
|
||||
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = buf_size;
|
||||
params.mem_buffer = buf;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph gf = {};
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||
|
||||
// wte
|
||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
// norm
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
|
||||
// cur = ln_1_g*cur + ln_1_b
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
}
|
||||
|
||||
struct ggml_tensor * inpSA = cur;
|
||||
|
||||
// self-attention
|
||||
{
|
||||
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
|
||||
|
||||
// store key and value to memory
|
||||
{
|
||||
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur));
|
||||
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
|
||||
( n_ctx)*ggml_element_size(model.memory_v),
|
||||
(il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
Qcur,
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K * Q
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale_inplace(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||
);
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
struct ggml_tensor * V =
|
||||
ggml_view_3d(ctx0, model.memory_v,
|
||||
n_past + N, n_embd/n_head, n_head,
|
||||
n_ctx*ggml_element_size(model.memory_v),
|
||||
n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
|
||||
il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||
cur = ggml_cpy(ctx0,
|
||||
KQV_merged,
|
||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||
|
||||
// projection (no bias)
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_proj_w,
|
||||
cur);
|
||||
}
|
||||
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
// feed-forward network
|
||||
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
|
||||
{
|
||||
// note here we pass inpSA instead of cur
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_fc_w,
|
||||
inpSA);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
|
||||
cur);
|
||||
|
||||
// GELU activation
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// projection
|
||||
// cur = proj_w*cur + proj_b
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_mlp_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
// self-attention + FF
|
||||
cur = ggml_add(ctx0, cur, inpFF);
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpL);
|
||||
}
|
||||
|
||||
// norm
|
||||
{
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.ln_f_g, inpL),
|
||||
inpL),
|
||||
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||
}
|
||||
|
||||
// lm_head
|
||||
{
|
||||
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
|
||||
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.lmh_b, inpL),
|
||||
inpL);
|
||||
}
|
||||
|
||||
// logits -> probs
|
||||
//inpL = ggml_soft_max_inplace(ctx0, inpL);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-j.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
|
||||
// return result for just the last token
|
||||
embd_w.resize(n_vocab);
|
||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return true;
|
||||
}
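Because only the last token's logits are returned, greedy sampling reduces to an argmax over embd_w; a sketch (real sampling elsewhere typically also applies temperature and top-k, omitted here):

#include <algorithm>

// pick the highest-scoring token id from the logits returned by gptj_eval
static gpt_vocab::id sample_greedy(const std::vector<float> & logits) {
    return (gpt_vocab::id) std::distance(logits.begin(),
                                         std::max_element(logits.begin(), logits.end()));
}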
|
464 otherarch/llama_v2-util.h Normal file
|
@ -0,0 +1,464 @@
|
|||
// Internal header to be included only by llama_v2.cpp.
|
||||
// Contains wrappers around OS interfaces.
|
||||
#pragma once
|
||||
#ifndef LLAMA_V2_UTIL_H
|
||||
#define LLAMA_V2_UTIL_H
|
||||
|
||||
#include "llama-util.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cerrno>
|
||||
#include <cstring>
|
||||
#include <cstdarg>
|
||||
#include <cstdlib>
|
||||
#include <climits>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
|
||||
#ifdef __has_include
|
||||
#if __has_include(<unistd.h>)
|
||||
#include <unistd.h>
|
||||
#if defined(_POSIX_MAPPED_FILES)
|
||||
#include <sys/mman.h>
|
||||
#endif
|
||||
#if defined(_POSIX_MEMLOCK_RANGE)
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
#include <stdio.h> // for _fseeki64
|
||||
#endif
|
||||
|
||||
#define LLAMA_V2_ASSERT(x) \
|
||||
do { \
|
||||
if (!(x)) { \
|
||||
fprintf(stderr, "LLAMA_V2_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
|
||||
abort(); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((format(gnu_printf, 1, 2)))
|
||||
#else
|
||||
__attribute__((format(printf, 1, 2)))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
struct llama_v2_file {
|
||||
// use FILE * so we don't have to re-open the file to mmap
|
||||
FILE * fp;
|
||||
size_t size;
|
||||
|
||||
llama_v2_file(const char * fname, const char * mode) {
|
||||
fp = std::fopen(fname, mode);
|
||||
if (fp == NULL) {
|
||||
throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
|
||||
}
|
||||
seek(0, SEEK_END);
|
||||
size = tell();
|
||||
seek(0, SEEK_SET);
|
||||
}
|
||||
|
||||
size_t tell() const {
|
||||
#ifdef _WIN32
|
||||
__int64 ret = _ftelli64(fp);
|
||||
#else
|
||||
long ret = std::ftell(fp);
|
||||
#endif
|
||||
LLAMA_V2_ASSERT(ret != -1); // this really shouldn't fail
|
||||
return (size_t) ret;
|
||||
}
|
||||
|
||||
void seek(size_t offset, int whence) {
|
||||
#ifdef _WIN32
|
||||
int ret = _fseeki64(fp, (__int64) offset, whence);
|
||||
#else
|
||||
int ret = std::fseek(fp, (long) offset, whence);
|
||||
#endif
|
||||
LLAMA_V2_ASSERT(ret == 0); // same
|
||||
}
|
||||
|
||||
void read_raw(void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
std::size_t ret = std::fread(ptr, size, 1, fp);
|
||||
if (ferror(fp)) {
|
||||
throw std::runtime_error(format("read error: %s", strerror(errno)));
|
||||
}
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(std::string("unexpectedly reached end of file"));
|
||||
}
|
||||
}
|
||||
|
||||
std::uint32_t read_u32() {
|
||||
std::uint32_t ret;
|
||||
read_raw(&ret, sizeof(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
std::string read_string(std::uint32_t len) {
|
||||
std::vector<char> chars(len);
|
||||
read_raw(chars.data(), len);
|
||||
return std::string(chars.data(), len);
|
||||
}
|
||||
|
||||
void write_raw(const void * ptr, size_t size) {
|
||||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
errno = 0;
|
||||
size_t ret = std::fwrite(ptr, size, 1, fp);
|
||||
if (ret != 1) {
|
||||
throw std::runtime_error(format("write error: %s", strerror(errno)));
|
||||
}
|
||||
}
|
||||
|
||||
void write_u32(std::uint32_t val) {
|
||||
write_raw(&val, sizeof(val));
|
||||
}
|
||||
|
||||
~llama_v2_file() {
|
||||
if (fp) {
|
||||
std::fclose(fp);
|
||||
}
|
||||
}
|
||||
};
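A small usage sketch for llama_v2_file (the constructor throws on failure; the path and the follow-up are hypothetical):

llama_v2_file file("models/llama-7b-q4_0.bin", "rb");  // hypothetical path
const uint32_t magic = file.read_u32();                // first 4 bytes identify the file format
printf("opened %zu-byte file, magic = %08x\n", file.size, magic);
file.seek(0, SEEK_SET);                                // rewind before handing off to the loader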
|
||||
|
||||
#if defined(_WIN32)
|
||||
static std::string llama_v2_format_win_err(DWORD err) {
|
||||
LPSTR buf;
|
||||
size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
|
||||
NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
|
||||
if (!size) {
|
||||
return "FormatMessageA failed";
|
||||
}
|
||||
std::string ret(buf, size);
|
||||
LocalFree(buf);
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
struct llama_v2_mmap {
|
||||
void * addr;
|
||||
size_t size;
|
||||
|
||||
llama_v2_mmap(const llama_v2_mmap &) = delete;
|
||||
|
||||
#ifdef _POSIX_MAPPED_FILES
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_v2_mmap(struct llama_v2_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
int fd = fileno(file->fp);
|
||||
int flags = MAP_SHARED;
|
||||
#ifdef __linux__
|
||||
flags |= MAP_POPULATE;
|
||||
#endif
|
||||
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
|
||||
if (addr == MAP_FAILED) {
|
||||
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
|
||||
}
|
||||
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
if (madvise(addr, file->size, MADV_WILLNEED)) {
|
||||
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
|
||||
strerror(errno));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~llama_v2_mmap() {
|
||||
munmap(addr, size);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
llama_v2_mmap(struct llama_v2_file * file, bool prefetch = true) {
|
||||
size = file->size;
|
||||
|
||||
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
|
||||
|
||||
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
|
||||
DWORD error = GetLastError();
|
||||
|
||||
if (hMapping == NULL) {
|
||||
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_v2_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
|
||||
error = GetLastError();
|
||||
CloseHandle(hMapping);
|
||||
|
||||
if (addr == NULL) {
|
||||
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_v2_format_win_err(error).c_str()));
|
||||
}
|
||||
|
||||
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
if (prefetch) {
|
||||
// Advise the kernel to preload the mapped memory
|
||||
WIN32_MEMORY_RANGE_ENTRY range;
|
||||
range.VirtualAddress = addr;
|
||||
range.NumberOfBytes = (SIZE_T)size;
|
||||
if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
||||
fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
||||
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
||||
}
|
||||
|
||||
~llama_v2_mmap() {
|
||||
if (!UnmapViewOfFile(addr)) {
|
||||
fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
llama_v2_mmap(struct llama_v2_file *, bool prefetch = true) {
|
||||
(void)prefetch;
|
||||
throw std::runtime_error(std::string("mmap not supported"));
|
||||
}
|
||||
#endif
|
||||
};
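Usage mirrors upstream llama.cpp: check SUPPORTED, map the already-opened file, and read tensor bytes straight from addr (file_off below is a hypothetical tensor offset):

if (llama_v2_mmap::SUPPORTED) {
    llama_v2_mmap mapping(&file, /*prefetch=*/true);
    const uint8_t * tensor_data = (const uint8_t *) mapping.addr + file_off;
    // ... consume tensor_data; the mapping is released when `mapping` goes out of scope
}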
|
||||
|
||||
// Represents some region of memory being locked using mlock or VirtualLock;
|
||||
// will automatically unlock on destruction.
|
||||
struct llama_v2_mlock {
|
||||
void * addr = NULL;
|
||||
size_t size = 0;
|
||||
bool failed_already = false;
|
||||
|
||||
llama_v2_mlock() {}
|
||||
llama_v2_mlock(const llama_v2_mlock &) = delete;
|
||||
|
||||
~llama_v2_mlock() {
|
||||
if (size) {
|
||||
raw_unlock(addr, size);
|
||||
}
|
||||
}
|
||||
|
||||
void init(void * addr) {
|
||||
LLAMA_V2_ASSERT(this->addr == NULL && this->size == 0);
|
||||
this->addr = addr;
|
||||
}
|
||||
|
||||
void grow_to(size_t target_size) {
|
||||
LLAMA_V2_ASSERT(addr);
|
||||
if (failed_already) {
|
||||
return;
|
||||
}
|
||||
size_t granularity = lock_granularity();
|
||||
target_size = (target_size + granularity - 1) & ~(granularity - 1);
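// rounds target_size up to the next multiple of granularity (valid because page
// sizes, and the 64 KiB fallback below, are powers of two)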
|
||||
if (target_size > size) {
|
||||
if (raw_lock((uint8_t *) addr + size, target_size - size)) {
|
||||
size = target_size;
|
||||
} else {
|
||||
failed_already = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _POSIX_MEMLOCK_RANGE
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) sysconf(_SC_PAGESIZE);
|
||||
}
|
||||
|
||||
#ifdef __APPLE__
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
|
||||
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
|
||||
#else
|
||||
#define MLOCK_SUGGESTION \
|
||||
"Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
|
||||
#endif
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
if (!mlock(addr, size)) {
|
||||
return true;
|
||||
} else {
|
||||
char* errmsg = std::strerror(errno);
|
||||
bool suggest = (errno == ENOMEM);
|
||||
|
||||
// Check if the resource limit is fine after all
|
||||
struct rlimit lock_limit;
|
||||
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
|
||||
suggest = false;
|
||||
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
|
||||
suggest = false;
|
||||
|
||||
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
||||
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
#undef MLOCK_SUGGESTION
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (munlock(addr, size)) {
|
||||
fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
||||
}
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
static constexpr bool SUPPORTED = true;
|
||||
|
||||
size_t lock_granularity() {
|
||||
SYSTEM_INFO si;
|
||||
GetSystemInfo(&si);
|
||||
return (size_t) si.dwPageSize;
|
||||
}
|
||||
|
||||
bool raw_lock(void * addr, size_t size) {
|
||||
for (int tries = 1; ; tries++) {
|
||||
if (VirtualLock(addr, size)) {
|
||||
return true;
|
||||
}
|
||||
if (tries == 2) {
|
||||
fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
||||
size, this->size, llama_v2_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// It failed but this was only the first try; increase the working
|
||||
// set size and try again.
|
||||
SIZE_T min_ws_size, max_ws_size;
|
||||
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
||||
fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
// Per MSDN: "The maximum number of pages that a process can lock
|
||||
// is equal to the number of pages in its minimum working set minus
|
||||
// a small overhead."
|
||||
// Hopefully a megabyte is enough overhead:
|
||||
size_t increment = size + 1048576;
|
||||
// The minimum must be <= the maximum, so we need to increase both:
|
||||
min_ws_size += increment;
|
||||
max_ws_size += increment;
|
||||
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
||||
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void raw_unlock(void * addr, size_t size) {
|
||||
if (!VirtualUnlock(addr, size)) {
|
||||
fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
|
||||
llama_v2_format_win_err(GetLastError()).c_str());
|
||||
}
|
||||
}
|
||||
#else
|
||||
static constexpr bool SUPPORTED = false;
|
||||
|
||||
size_t lock_granularity() {
|
||||
return (size_t) 65536;
|
||||
}
|
||||
|
||||
bool raw_lock(const void * addr, size_t size) {
|
||||
fprintf(stderr, "warning: mlock not supported on this system\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
void raw_unlock(const void * addr, size_t size) {}
|
||||
#endif
|
||||
};
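In the loaders this is typically paired with a buffer: init once with the base address, then grow_to as more of the buffer is filled (sketch using the llama_v2_buffer defined just below):

llama_v2_buffer buf;
buf.resize(64 * 1024 * 1024);

llama_v2_mlock lock;
lock.init(buf.addr);     // remember the base address once
lock.grow_to(buf.size);  // lock the populated region; a failure only prints a warning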
|
||||
|
||||
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
|
||||
struct llama_v2_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
llama_v2_buffer() = default;
|
||||
|
||||
void resize(size_t size) {
|
||||
delete[] addr;
|
||||
addr = new uint8_t[size];
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
~llama_v2_buffer() {
|
||||
delete[] addr;
|
||||
}
|
||||
|
||||
// disable copy and move
|
||||
llama_v2_buffer(const llama_v2_buffer&) = delete;
|
||||
llama_v2_buffer(llama_v2_buffer&&) = delete;
|
||||
llama_v2_buffer& operator=(const llama_v2_buffer&) = delete;
|
||||
llama_v2_buffer& operator=(llama_v2_buffer&&) = delete;
|
||||
};
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
struct llama_v2_ctx_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
bool is_cuda;
|
||||
size_t size = 0;
|
||||
|
||||
llama_v2_ctx_buffer() = default;
|
||||
|
||||
void resize(size_t size) {
|
||||
free();
|
||||
|
||||
addr = (uint8_t *) ggml_cuda_host_malloc(size);
|
||||
if (addr) {
|
||||
is_cuda = true;
|
||||
}
|
||||
else {
|
||||
// fall back to pageable memory
|
||||
addr = new uint8_t[size];
|
||||
is_cuda = false;
|
||||
}
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
void free() {
|
||||
if (addr) {
|
||||
if (is_cuda) {
|
||||
ggml_cuda_host_free(addr);
|
||||
}
|
||||
else {
|
||||
delete[] addr;
|
||||
}
|
||||
}
|
||||
addr = NULL;
|
||||
}
|
||||
|
||||
~llama_v2_ctx_buffer() {
|
||||
free();
|
||||
}
|
||||
|
||||
// disable copy and move
|
||||
llama_v2_ctx_buffer(const llama_v2_ctx_buffer&) = delete;
|
||||
llama_v2_ctx_buffer(llama_v2_ctx_buffer&&) = delete;
|
||||
llama_v2_ctx_buffer& operator=(const llama_v2_ctx_buffer&) = delete;
|
||||
llama_v2_ctx_buffer& operator=(llama_v2_ctx_buffer&&) = delete;
|
||||
};
|
||||
#else
|
||||
typedef llama_v2_buffer llama_v2_ctx_buffer;
|
||||
#endif
|
||||
|
||||
#endif
|
|
@ -5,7 +5,7 @@
|
|||
#include <cstdio>
|
||||
#endif
|
||||
|
||||
#include "llama-util.h"
|
||||
#include "llama_v2-util.h"
|
||||
#include "llama_v2.h"
|
||||
|
||||
#include "ggml_v2.h"
|
||||
|
@ -39,67 +39,67 @@
|
|||
#define LLAMA_V2_MAX_SCRATCH_BUFFERS 16
|
||||
|
||||
// available llama models
|
||||
enum e_model {
|
||||
MODEL_UNKNOWN,
|
||||
MODEL_7B,
|
||||
MODEL_13B,
|
||||
MODEL_30B,
|
||||
MODEL_65B,
|
||||
enum e_model2 {
|
||||
MODEL_UNKNOWN_2,
|
||||
MODEL_7B_2,
|
||||
MODEL_13B_2,
|
||||
MODEL_30B_2,
|
||||
MODEL_65B_2,
|
||||
};
|
||||
|
||||
static const size_t MB = 1024*1024;
|
||||
static const size_t MB_2 = 1024*1024;
|
||||
|
||||
// computed for n_ctx == 2048
|
||||
// TODO: dynamically determine these sizes
|
||||
// needs modifications in ggml
|
||||
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||
static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH0_2()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN, 512ull * MB },
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
static std::map<e_model2, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN_2, 512ull * MB_2 },
|
||||
{ MODEL_7B_2, 512ull * MB_2 },
|
||||
{ MODEL_13B_2, 512ull * MB_2 },
|
||||
{ MODEL_30B_2, 512ull * MB_2 },
|
||||
{ MODEL_65B_2, 1024ull * MB_2 },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
||||
static const std::map<e_model2, size_t> & MEM_REQ_SCRATCH1_2()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN, 512ull * MB },
|
||||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
static std::map<e_model2, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN_2, 512ull * MB_2 },
|
||||
{ MODEL_7B_2, 512ull * MB_2 },
|
||||
{ MODEL_13B_2, 512ull * MB_2 },
|
||||
{ MODEL_30B_2, 512ull * MB_2 },
|
||||
{ MODEL_65B_2, 1024ull * MB_2 },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// 2*n_embd*n_ctx*n_layer*sizeof(float16)
|
||||
static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
|
||||
static const std::map<e_model2, size_t> & MEM_REQ_KV_SELF_2()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN, 1026ull * MB },
|
||||
{ MODEL_7B, 1026ull * MB },
|
||||
{ MODEL_13B, 1608ull * MB },
|
||||
{ MODEL_30B, 3124ull * MB },
|
||||
{ MODEL_65B, 5120ull * MB },
|
||||
static std::map<e_model2, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN_2, 1026ull * MB_2 },
|
||||
{ MODEL_7B_2, 1026ull * MB_2 },
|
||||
{ MODEL_13B_2, 1608ull * MB_2 },
|
||||
{ MODEL_30B_2, 3124ull * MB_2 },
|
||||
{ MODEL_65B_2, 5120ull * MB_2 },
|
||||
};
|
||||
return k_sizes;
|
||||
}
|
||||
|
||||
// this is mostly needed for temporary mul_mat buffers to dequantize the data
|
||||
// not actually needed if BLAS is disabled
|
||||
static const std::map<e_model, size_t> & MEM_REQ_EVAL()
|
||||
static const std::map<e_model2, size_t> & MEM_REQ_EVAL_2()
|
||||
{
|
||||
static std::map<e_model, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN, 800ull * MB },
|
||||
{ MODEL_7B, 800ull * MB },
|
||||
{ MODEL_13B, 1024ull * MB },
|
||||
{ MODEL_30B, 1280ull * MB },
|
||||
{ MODEL_65B, 1536ull * MB },
|
||||
static std::map<e_model2, size_t> k_sizes = {
|
||||
{ MODEL_UNKNOWN_2, 800ull * MB_2 },
|
||||
{ MODEL_7B_2, 800ull * MB_2 },
|
||||
{ MODEL_13B_2, 1024ull * MB_2 },
|
||||
{ MODEL_30B_2, 1280ull * MB_2 },
|
||||
{ MODEL_65B_2, 1536ull * MB_2 },
|
||||
};
|
||||
return k_sizes;
|
||||
}
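Further down in this file the renamed tables are combined into the overall memory estimate; roughly as follows (ctx_size, mmapped_size and scale are computed by the loader and hypothetical here):

const e_model2 type = MODEL_7B_2;
const size_t mem_required =
    ctx_size + mmapped_size +
    MEM_REQ_SCRATCH0_2().at(type) +
    MEM_REQ_SCRATCH1_2().at(type) +
    MEM_REQ_EVAL_2().at(type);
const size_t mem_required_state = scale * MEM_REQ_KV_SELF_2().at(type);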
|
||||
|
@ -157,7 +157,7 @@ struct llama_v2_kv_cache {
|
|||
};
|
||||
|
||||
struct llama_v2_model {
|
||||
e_model type = MODEL_UNKNOWN;
|
||||
e_model2 type = MODEL_UNKNOWN_2;
|
||||
|
||||
llama_v2_hparams hparams;
|
||||
|
||||
|
@ -276,7 +276,7 @@ struct llama_v2_context {
|
|||
};
|
||||
|
||||
template <typename T>
|
||||
static T checked_mul(T a, T b) {
|
||||
static T checked_mul2(T a, T b) {
|
||||
T ret = a * b;
|
||||
if (a != 0 && ret / a != b) {
|
||||
throw format("overflow multiplying %llu * %llu",
|
||||
|
@ -285,7 +285,7 @@ static T checked_mul(T a, T b) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
static size_t checked_div(size_t a, size_t b) {
|
||||
static size_t checked_div2(size_t a, size_t b) {
|
||||
if (b == 0 || a % b != 0) {
|
||||
throw format("error dividing %zu / %zu", a, b);
|
||||
}
|
||||
|
@ -304,7 +304,7 @@ static std::string llama_v2_format_tensor_shape(const std::vector<uint32_t> & ne
|
|||
static size_t llama_v2_calc_tensor_size(const std::vector<uint32_t> & ne, enum ggml_v2_type type) {
|
||||
size_t size = ggml_v2_type_size(type);
|
||||
for (uint32_t dim : ne) {
|
||||
size = checked_mul<size_t>(size, dim);
|
||||
size = checked_mul2<size_t>(size, dim);
|
||||
}
|
||||
return size / ggml_v2_blck_size(type);
|
||||
}
|
||||
|
@ -322,9 +322,9 @@ struct llama_v2_load_tensor_shard {
|
|||
};
|
||||
|
||||
enum llama_v2_split_type {
|
||||
SPLIT_NONE,
|
||||
SPLIT_BY_COLUMNS,
|
||||
SPLIT_BY_ROWS
|
||||
SPLIT_NONE_2,
|
||||
SPLIT_BY_COLUMNS_2,
|
||||
SPLIT_BY_ROWS_2
|
||||
};
|
||||
|
||||
struct llama_v2_load_tensor {
|
||||
|
@ -332,7 +332,7 @@ struct llama_v2_load_tensor {
|
|||
|
||||
std::string name;
|
||||
enum ggml_v2_type type = GGML_V2_TYPE_F32;
|
||||
llama_v2_split_type split_type = SPLIT_NONE;
|
||||
llama_v2_split_type split_type = SPLIT_NONE_2;
|
||||
std::vector<uint32_t> ne;
|
||||
size_t size;
|
||||
struct ggml_v2_tensor * ggml_v2_tensor = NULL;
|
||||
|
@ -360,13 +360,13 @@ struct llama_v2_load_tensor {
|
|||
void calc_split_type() {
|
||||
if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file
|
||||
shards.size() == 1) { // only one file?
|
||||
split_type = SPLIT_NONE;
|
||||
split_type = SPLIT_NONE_2;
|
||||
} else if (name.find("tok_embeddings.") == 0 ||
|
||||
name.find(".attention.wo.weight") != std::string::npos ||
|
||||
name.find(".feed_forward.w2.weight") != std::string::npos) {
|
||||
split_type = SPLIT_BY_COLUMNS;
|
||||
split_type = SPLIT_BY_COLUMNS_2;
|
||||
} else {
|
||||
split_type = SPLIT_BY_ROWS;
|
||||
split_type = SPLIT_BY_ROWS_2;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -382,16 +382,16 @@ struct llama_v2_load_tensor {
|
|||
LLAMA_V2_ASSERT(shards.size() <= UINT32_MAX);
|
||||
uint32_t n_shards = (uint32_t) shards.size();
|
||||
switch (split_type) {
|
||||
case SPLIT_NONE:
|
||||
case SPLIT_NONE_2:
|
||||
ne = first_shard.ne;
|
||||
break;
|
||||
case SPLIT_BY_COLUMNS:
|
||||
ne = {checked_mul<uint32_t>(first_shard.ne[0], n_shards),
|
||||
case SPLIT_BY_COLUMNS_2:
|
||||
ne = {checked_mul2<uint32_t>(first_shard.ne[0], n_shards),
|
||||
first_shard.ne[1]};
|
||||
break;
|
||||
case SPLIT_BY_ROWS:
|
||||
case SPLIT_BY_ROWS_2:
|
||||
ne = {first_shard.ne[0],
|
||||
checked_mul<uint32_t>(first_shard.ne[1], n_shards)};
|
||||
checked_mul2<uint32_t>(first_shard.ne[1], n_shards)};
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -737,11 +737,11 @@ struct llama_v2_model_loader {
|
|||
if (use_mmap) {
|
||||
LLAMA_V2_ASSERT(lt.shards.size() == 1);
|
||||
lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off;
|
||||
} else if (lt.split_type == SPLIT_NONE) {
|
||||
} else if (lt.split_type == SPLIT_NONE_2) {
|
||||
llama_v2_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file;
|
||||
file.seek(lt.shards.at(0).file_off, SEEK_SET);
|
||||
file.read_raw(lt.data, lt.size);
|
||||
} else if (lt.split_type == SPLIT_BY_ROWS) {
|
||||
} else if (lt.split_type == SPLIT_BY_ROWS_2) {
|
||||
size_t offset = 0;
|
||||
for (llama_v2_load_tensor_shard & shard : lt.shards) {
|
||||
llama_v2_file & file = file_loaders.at(shard.file_idx)->file;
|
||||
|
@ -750,7 +750,7 @@ struct llama_v2_model_loader {
|
|||
offset += shard.size;
|
||||
}
|
||||
LLAMA_V2_ASSERT(offset == lt.size);
|
||||
} else if (lt.split_type == SPLIT_BY_COLUMNS) {
|
||||
} else if (lt.split_type == SPLIT_BY_COLUMNS_2) {
|
||||
// Let's load the data into temporary buffers to ensure the OS performs large loads.
|
||||
std::vector<llama_v2_buffer> tmp_bufs(lt.shards.size());
|
||||
for (size_t i = 0; i < lt.shards.size(); i++) {
|
||||
|
@ -807,7 +807,7 @@ static bool kv_cache_init(
|
|||
const int64_t n_mem = n_layer*n_ctx;
|
||||
const int64_t n_elements = n_embd*n_mem;
|
||||
|
||||
cache.buf.resize(2u*n_elements*ggml_v2_type_size(wtype) + 2u*MB);
|
||||
cache.buf.resize(2u*n_elements*ggml_v2_type_size(wtype) + 2u*MB_2);
|
||||
|
||||
struct ggml_v2_init_params params;
|
||||
params.mem_size = cache.buf.size;
|
||||
|
@ -888,12 +888,12 @@ static const char *llama_v2_ftype_name(enum llama_v2_ftype ftype) {
|
|||
}
|
||||
}
|
||||
|
||||
static const char *llama_v2_model_type_name(e_model type) {
|
||||
static const char *llama_v2_model_type_name(e_model2 type) {
|
||||
switch (type) {
|
||||
case MODEL_7B: return "7B";
|
||||
case MODEL_13B: return "13B";
|
||||
case MODEL_30B: return "30B";
|
||||
case MODEL_65B: return "65B";
|
||||
case MODEL_7B_2: return "7B";
|
||||
case MODEL_13B_2: return "13B";
|
||||
case MODEL_30B_2: return "30B";
|
||||
case MODEL_65B_2: return "65B";
|
||||
default:
|
||||
printf("\nWARNING: NON-STANDARD LLAMA FILE DETECTED. DEFAULT TO 7B SIZE.\n");
|
||||
return "UNKNOWN";
|
||||
|
@ -925,11 +925,11 @@ static void llama_v2_model_load_internal(

{
switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break;
case 60: model.type = e_model::MODEL_30B; break;
case 80: model.type = e_model::MODEL_65B; break;
default: model.type = e_model::MODEL_UNKNOWN; break;
case 32: model.type = e_model2::MODEL_7B_2; break;
case 40: model.type = e_model2::MODEL_13B_2; break;
case 60: model.type = e_model2::MODEL_30B_2; break;
case 80: model.type = e_model2::MODEL_65B_2; break;
default: model.type = e_model2::MODEL_UNKNOWN_2; break;
}

hparams.n_ctx = n_ctx;

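Reviewer note: the switch above infers the model preset purely from the layer count; e_model2 and the _2 enumerators are the renamed v2 types. A compact sketch of the same heuristic with an illustrative enum (not the project's):

#include <cstdio>

// The loader picks a size preset purely from the layer count; anything else
// falls through to UNKNOWN (and, per the hunk above, is later reported as 7B-sized).
enum class ModelSize { MODEL_7B, MODEL_13B, MODEL_30B, MODEL_65B, MODEL_UNKNOWN };

static ModelSize size_from_layers(int n_layer) {
    switch (n_layer) {
        case 32: return ModelSize::MODEL_7B;
        case 40: return ModelSize::MODEL_13B;
        case 60: return ModelSize::MODEL_30B;
        case 80: return ModelSize::MODEL_65B;
        default: return ModelSize::MODEL_UNKNOWN;
    }
}

int main() {
    printf("%d\n", (int) size_from_layers(40)); // 1 -> MODEL_13B
    return 0;
}
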
@ -985,13 +985,13 @@ static void llama_v2_model_load_internal(
const size_t mem_required =
ctx_size +
mmapped_size +
MEM_REQ_SCRATCH0().at(model.type) +
MEM_REQ_SCRATCH1().at(model.type) +
MEM_REQ_EVAL().at(model.type);
MEM_REQ_SCRATCH0_2().at(model.type) +
MEM_REQ_SCRATCH1_2().at(model.type) +
MEM_REQ_EVAL_2().at(model.type);

// this is the memory required by one llama_v2_state
const size_t mem_required_state =
scale*MEM_REQ_KV_SELF().at(model.type);
scale*MEM_REQ_KV_SELF_2().at(model.type);

fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);

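Reviewer note: the estimate above is a plain sum of the ggml context size, the mmapped weights, and the per-model scratch/eval buffers; the _2 lookups are the renamed tables keyed by the v2 model type. A sketch of that structure with placeholder figures (the real MEM_REQ_* values live in llama_v2.cpp and differ per model size):

#include <cstdio>
#include <map>
#include <string>

int main() {
    // Illustrative only: placeholder buffer sizes keyed by a model-size label,
    // standing in for the MEM_REQ_SCRATCH0_2/SCRATCH1_2/EVAL_2 tables.
    std::map<std::string, size_t> scratch0 = { {"7B", 512u*1024*1024} };
    std::map<std::string, size_t> scratch1 = { {"7B", 512u*1024*1024} };
    std::map<std::string, size_t> eval_buf = { {"7B", 768u*1024*1024} };

    size_t ctx_size = 64u*1024*1024;         // placeholder metadata size
    size_t mmapped  = 3900ull*1024*1024;     // placeholder weight size
    size_t mem_required = ctx_size + mmapped
                        + scratch0["7B"] + scratch1["7B"] + eval_buf["7B"];
    printf("mem required = %.2f MB\n", mem_required / 1024.0 / 1024.0);
    return 0;
}
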
@ -1462,7 +1462,7 @@ static bool llama_v2_eval_internal(
// tokenizer
//

static size_t utf8_len(char src) {
static size_t utf8_len2(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
return lookup[highbits];

@ -1503,7 +1503,7 @@ struct llama_v2_tokenizer {
size_t offs = 0;
while (offs < text.size()) {
llama_v2_sp_symbol sym;
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
size_t char_len = std::min(text.size() - offs, utf8_len2(text[offs]));
sym.text = text.c_str() + offs;
sym.n = char_len;
offs += char_len;

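Reviewer note: utf8_len2 (the renamed utf8_len) reads the high nibble of the lead byte to get the UTF-8 sequence length, and the tokenizer loop above clamps it to the remaining input. A standalone sketch of the same table plus the scan loop it feeds:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

// Same idea as utf8_len2 above: the high nibble of a UTF-8 lead byte determines
// how many bytes the code point occupies (0xxxxxxx = 1, 110x = 2, 1110 = 3,
// 11110 = 4; continuation bytes 10xx also map to 1 so the scan always advances).
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    std::string text = "héllo"; // 'é' is 2 bytes in UTF-8
    for (size_t offs = 0; offs < text.size();) {
        size_t len = std::min(text.size() - offs, utf8_len(text[offs]));
        printf("symbol of %zu byte(s)\n", len);
        offs += len;
    }
    return 0;
}
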
@ -2251,10 +2251,10 @@ struct llama_v2_context * llama_v2_init_from_file(
ctx->embedding.resize(hparams.n_embd);
}

ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
ctx->buf_compute.resize(MEM_REQ_EVAL_2().at(ctx->model.type));

ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0_2().at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1_2().at(ctx->model.type));
}

return ctx;

@ -3077,5 +3077,15 @@ std::vector<llama_v2_token> legacy_llama_v2_tokenize(struct llama_v2_context * c
int n = legacy_llama_v2_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
res.resize(n);

return res;
}

std::vector<llama_token> llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt number of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int) add_bos);
const int n = llama_v2_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);

return res;
}

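Reviewer note: the new llama_v2_tokenize(std::string) overload follows the usual pattern of over-allocating one token per prompt character (plus BOS) and shrinking afterwards. A self-contained sketch of that pattern with a toy tokenizer standing in for the C-style API (the real call needs a loaded llama_v2_context):

#include <cassert>
#include <cstdio>
#include <string>
#include <vector>

// Stand-in for the C-style tokenizer wrapped above: writes at most n_max ids
// into out and returns how many were written. Here it just maps one character
// to one id so the example is self-contained.
static int toy_tokenize(const std::string & text, int * out, int n_max, bool add_bos) {
    int n = 0;
    if (add_bos && n < n_max) out[n++] = 1; // BOS id
    for (char c : text) { if (n >= n_max) break; out[n++] = (unsigned char) c; }
    return n;
}

// Same pattern as the new std::string wrapper: allocate the worst case
// (one token per prompt character, plus BOS), then shrink to fit.
static std::vector<int> tokenize(const std::string & text, bool add_bos) {
    std::vector<int> res(text.size() + (int) add_bos);
    const int n = toy_tokenize(text, res.data(), (int) res.size(), add_bos);
    assert(n >= 0);
    res.resize(n);
    return res;
}

int main() {
    auto ids = tokenize("hello", true);
    printf("%zu tokens\n", ids.size()); // 6
    return 0;
}
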
@ -16,7 +16,7 @@


// load the model's weights from a file
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
ModelLoadResult gpt_neox_v2_model_load(const std::string & fname, gpt_neox_v2_model & model, gpt_vocab & vocab, FileFormat file_format) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

auto fin = std::ifstream(fname, std::ios::binary);

@ -344,7 +344,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &

// feed-forward network
ggml_v2_tensor * gpt_neox_ff(
const gpt_neox_layer &layer,
const gpt_neox_layer_v2 &layer,
ggml_v2_context * ctx0,
ggml_v2_tensor * inp) {
ggml_v2_tensor * cur = ggml_v2_norm(ctx0, inp);

@ -386,8 +386,8 @@ ggml_v2_tensor * gpt_neox_ff(
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
bool gpt_neox_eval(
const gpt_neox_model & model,
bool gpt_neox_v2_eval(
const gpt_neox_v2_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,

613
otherarch/neox_v3.cpp
Normal file
|
@ -0,0 +1,613 @@
|
|||
#include "ggml.h"
|
||||
#include "otherarch.h"
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
|
||||
|
||||
|
||||
// load the model's weights from a file
|
||||
ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
|
||||
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||
|
||||
auto fin = std::ifstream(fname, std::ios::binary);
|
||||
if (!fin) {
|
||||
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// verify magic
|
||||
{
|
||||
uint32_t magic;
|
||||
fin.read((char *) &magic, sizeof(magic));
|
||||
if (magic != 0x67676d6c) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// load hparams
|
||||
{
|
||||
auto & hparams = model.hparams;
|
||||
hparams.par_res = 1; //true
|
||||
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
||||
if(file_format!=FileFormat::NEOX_1 && file_format!=FileFormat::NEOX_2 && file_format!=FileFormat::NEOX_3)
|
||||
{
|
||||
fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
|
||||
}
|
||||
if(file_format==FileFormat::NEOX_3)
|
||||
{
|
||||
hparams.par_res = 0;
|
||||
}
|
||||
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||
|
||||
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||
|
||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
||||
printf("%s: par_res = %d\n", __func__, hparams.par_res);
|
||||
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||
printf("%s: qntvr = %d\n", __func__, qntvr);
|
||||
|
||||
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
||||
}
|
||||
|
||||
// load vocab
|
||||
{
|
||||
const int32_t n_vocab = model.hparams.n_vocab;
|
||||
|
||||
std::string word;
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
fin.read((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
}
|
||||
}
|
||||
|
||||
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||
// in order to save memory and also to speed up the computation
|
||||
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
||||
if (wtype == GGML_TYPE_COUNT) {
|
||||
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||
__func__, fname.c_str(), model.hparams.ftype);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto & ctx = model.ctx;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
||||
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
|
||||
|
||||
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
|
||||
//ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
||||
|
||||
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
|
||||
ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
|
||||
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
|
||||
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
||||
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||
|
||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
||||
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
|
||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
|
||||
|
||||
ctx_size += (6 + 16*n_layer)*512; // object overhead
|
||||
|
||||
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||
}
|
||||
|
||||
// create the ggml context
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
params.no_alloc = false;
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// prepare memory for the weights
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
|
||||
model.layers.resize(n_layer);
|
||||
|
||||
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
|
||||
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||
//model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
|
||||
|
||||
// map by name
|
||||
model.tensors["gpt_neox.embed_in.weight"] = model.wte;
|
||||
|
||||
model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g;
|
||||
model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b;
|
||||
|
||||
model.tensors["embed_out.weight"] = model.lmh_g;
|
||||
//model.tensors["lm_head.bias"] = model.lmh_b;
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
|
||||
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
||||
|
||||
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
||||
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||
|
||||
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||
|
||||
// map by name
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b;
|
||||
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w;
|
||||
model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b;
|
||||
}
|
||||
}
|
||||
|
||||
// key + value memory
|
||||
{
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
|
||||
const int64_t n_mem = n_layer*n_ctx;
|
||||
const int64_t n_elements = n_embd*n_mem;
|
||||
|
||||
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
||||
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
|
||||
|
||||
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
|
||||
|
||||
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||
}
|
||||
|
||||
// load weights
|
||||
{
|
||||
int n_tensors = 0;
|
||||
size_t total_size = 0;
|
||||
|
||||
printf("%s: ", __func__);
|
||||
|
||||
while (true) {
|
||||
int32_t n_dims;
|
||||
int32_t length;
|
||||
int32_t ttype;
|
||||
|
||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||
|
||||
if (fin.eof()) {
|
||||
break;
|
||||
}
|
||||
|
||||
int32_t nelements = 1;
|
||||
int32_t ne[2] = { 1, 1 };
|
||||
for (int i = 0; i < n_dims; ++i) {
|
||||
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||
nelements *= ne[i];
|
||||
}
|
||||
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
auto tensor = model.tensors[name.data()];
|
||||
if (ggml_nelements(tensor) != nelements) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
|
||||
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
|
||||
// for debugging
|
||||
if (0) {
|
||||
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||
}
|
||||
|
||||
size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||
|
||||
if(file_format==FileFormat::NEOX_1)
|
||||
{
|
||||
switch (ttype) {
|
||||
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
|
||||
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
|
||||
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
|
||||
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
|
||||
default:
|
||||
{
|
||||
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype);
|
||||
return ModelLoadResult::FAIL;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||
ggml_free(ctx);
|
||||
return ModelLoadResult::RETRY_LOAD;
|
||||
}
|
||||
|
||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||
|
||||
total_size += ggml_nbytes(tensor);
|
||||
if (++n_tensors % 8 == 0) {
|
||||
printf(".");
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
printf(" done\n");
|
||||
|
||||
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
||||
}
|
||||
|
||||
fin.close();
|
||||
|
||||
return ModelLoadResult::SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
// feed-forward network
|
||||
ggml_tensor * gpt_neox_ff(
|
||||
const gpt_neox_layer &layer,
|
||||
ggml_context * ctx0,
|
||||
ggml_tensor * inp) {
|
||||
ggml_tensor * cur = ggml_norm(ctx0, inp);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, layer.ln_2_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, layer.ln_2_b, cur));
|
||||
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
layer.c_mlp_fc_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, layer.c_mlp_fc_b, cur),
|
||||
cur);
|
||||
|
||||
// GELU activation
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
|
||||
// projection
|
||||
// cur = proj_w*cur + proj_b
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
layer.c_mlp_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, layer.c_mlp_proj_b, cur),
|
||||
cur);
|
||||
return cur;
|
||||
}
|
||||
|
||||
// evaluate the transformer
|
||||
//
|
||||
// - model: the model
|
||||
// - n_threads: number of threads to use
|
||||
// - n_past: the context size so far
|
||||
// - embd_inp: the embeddings of the tokens in the context
|
||||
// - embd_w: the predicted logits for the next token
|
||||
//
|
||||
bool gpt_neox_eval(
|
||||
const gpt_neox_model & model,
|
||||
const int n_threads,
|
||||
const int n_past,
|
||||
const std::vector<gpt_vocab::id> & embd_inp,
|
||||
std::vector<float> & embd_w,
|
||||
size_t & mem_per_token) {
|
||||
const int N = embd_inp.size();
|
||||
|
||||
const auto & hparams = model.hparams;
|
||||
|
||||
const int n_embd = hparams.n_embd;
|
||||
const int n_layer = hparams.n_layer;
|
||||
const int n_ctx = hparams.n_ctx;
|
||||
const int n_head = hparams.n_head;
|
||||
const int n_vocab = hparams.n_vocab;
|
||||
const int n_rot = hparams.n_rot;
|
||||
|
||||
static size_t buf_size = 256u*1024*1024;
|
||||
static void * buf = malloc(buf_size);
|
||||
|
||||
if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) {
|
||||
const size_t buf_size_new = 360u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||
|
||||
// reallocate
|
||||
buf_size = buf_size_new;
|
||||
buf = realloc(buf, buf_size);
|
||||
if (buf == nullptr) {
|
||||
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = buf_size;
|
||||
params.mem_buffer = buf;
|
||||
params.no_alloc = false;
|
||||
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(params);
|
||||
struct ggml_cgraph gf = {};
|
||||
gf.n_threads = n_threads;
|
||||
|
||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||
|
||||
// wte
|
||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * cur;
|
||||
|
||||
// self-attention
|
||||
{
|
||||
{
|
||||
cur = ggml_norm(ctx0, inpL);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
||||
cur),
|
||||
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||
}
|
||||
|
||||
// compute QKV
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_attn_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0,
|
||||
ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
|
||||
cur);
|
||||
}
|
||||
|
||||
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head));
|
||||
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head));
|
||||
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head));
|
||||
|
||||
// using mode = 2 for GPT-NeoX mode
|
||||
Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2);
|
||||
Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2);
|
||||
|
||||
// store key and value to memory
|
||||
{
|
||||
Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N));
|
||||
|
||||
struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
|
||||
struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd,
|
||||
( n_ctx)*ggml_element_size(model.memory_v),
|
||||
(il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v));
|
||||
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||
}
|
||||
|
||||
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * Q =
|
||||
ggml_permute(ctx0,
|
||||
Qcur,
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * K =
|
||||
ggml_permute(ctx0,
|
||||
ggml_reshape_3d(ctx0,
|
||||
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
|
||||
n_embd/n_head, n_head, n_past + N),
|
||||
0, 2, 1, 3);
|
||||
|
||||
// K * Q
|
||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||
|
||||
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||
struct ggml_tensor * KQ_scaled =
|
||||
ggml_scale_inplace(ctx0,
|
||||
KQ,
|
||||
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||
);
|
||||
|
||||
// KQ_masked = mask_past(KQ_scaled)
|
||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
|
||||
|
||||
// KQ = soft_max(KQ_masked)
|
||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
|
||||
|
||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||
struct ggml_tensor * V =
|
||||
ggml_view_3d(ctx0, model.memory_v,
|
||||
n_past + N, n_embd/n_head, n_head,
|
||||
n_ctx*ggml_element_size(model.memory_v),
|
||||
n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head,
|
||||
il*n_ctx*ggml_element_size(model.memory_v)*n_embd);
|
||||
|
||||
// KQV = transpose(V) * KQ_soft_max
|
||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||
|
||||
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||
|
||||
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||
cur = ggml_cpy(ctx0,
|
||||
KQV_merged,
|
||||
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||
|
||||
// projection
|
||||
{
|
||||
cur = ggml_mul_mat(ctx0,
|
||||
model.layers[il].c_attn_proj_w,
|
||||
cur);
|
||||
|
||||
cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur);
|
||||
}
|
||||
}
|
||||
|
||||
if (hparams.par_res == 0) {
|
||||
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
|
||||
|
||||
cur = gpt_neox_ff(model.layers[il], ctx0, inpFF);
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpFF);
|
||||
} else {
|
||||
struct ggml_tensor * inpFF = cur;
|
||||
|
||||
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
|
||||
// note here we pass inpL instead of cur
|
||||
cur = gpt_neox_ff(model.layers[il], ctx0, inpL);
|
||||
|
||||
// layer input + FF
|
||||
cur = ggml_add(ctx0, cur, inpFF);
|
||||
|
||||
// input for next layer
|
||||
inpL = ggml_add(ctx0, cur, inpL);
|
||||
}
|
||||
}
|
||||
|
||||
// norm
|
||||
{
|
||||
inpL = ggml_norm(ctx0, inpL);
|
||||
|
||||
// inpL = ln_f_g*inpL + ln_f_b
|
||||
inpL = ggml_add(ctx0,
|
||||
ggml_mul(ctx0,
|
||||
ggml_repeat(ctx0, model.ln_f_g, inpL),
|
||||
inpL),
|
||||
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||
}
|
||||
|
||||
// lm_head
|
||||
{
|
||||
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
|
||||
|
||||
//inpL = ggml_add(ctx0,
|
||||
// ggml_repeat(ctx0, model.lmh_b, inpL),
|
||||
// inpL);
|
||||
}
|
||||
|
||||
// logits -> probs
|
||||
//inpL = ggml_soft_max_inplace(ctx0, inpL);
|
||||
|
||||
// run the computation
|
||||
ggml_build_forward_expand(&gf, inpL);
|
||||
ggml_graph_compute (ctx0, &gf);
|
||||
|
||||
//if (n_past%100 == 0) {
|
||||
// ggml_graph_print (&gf);
|
||||
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
||||
//}
|
||||
|
||||
//embd_w.resize(n_vocab*N);
|
||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||
|
||||
// return result for just the last token
|
||||
embd_w.resize(n_vocab);
|
||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||
|
||||
if (mem_per_token == 0) {
|
||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||
}
|
||||
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
return true;
|
||||
}
|
|
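Reviewer note on the new otherarch/neox_v3.cpp above: the hparams block keys its parsing off the detected FileFormat; NEOX_1/2/3 headers carry no par_res field, and NEOX_3 additionally forces the non-parallel residual path used later in gpt_neox_eval. A compact sketch of that gating (the enum values and Header struct below are illustrative, not the project's FileFormat):

#include <cstdio>

// Illustrative file-format gating, mirroring the hparams block in
// gpt_neox_model_load: older NEOX formats have no par_res field in the
// header, and NEOX_3 means "non-parallel residual" regardless.
enum class NeoXFormat { NEOX_1, NEOX_2, NEOX_3, NEOX_NEWER };

struct Header { int par_res_field; bool has_par_res_field; };

static int resolve_par_res(NeoXFormat fmt, const Header & h) {
    int par_res = 1; // default: parallel residual (true)
    if (fmt != NeoXFormat::NEOX_1 && fmt != NeoXFormat::NEOX_2 && fmt != NeoXFormat::NEOX_3) {
        if (h.has_par_res_field) par_res = h.par_res_field; // newer formats store it
    }
    if (fmt == NeoXFormat::NEOX_3) par_res = 0;
    return par_res;
}

int main() {
    Header h{0, true};
    printf("NEOX_3 -> par_res=%d\n", resolve_par_res(NeoXFormat::NEOX_3, h));     // 0 (forced)
    printf("newer  -> par_res=%d\n", resolve_par_res(NeoXFormat::NEOX_NEWER, h)); // 0 (from header)
    return 0;
}
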
@ -87,7 +87,7 @@ struct gptj_layer_v1 {
struct ggml_v1_tensor * c_mlp_proj_b;
};

struct gptj_model_v1 {
struct gptj_v1_model {
gptj_hparams hparams;

// normalization

@ -110,7 +110,7 @@ struct gptj_model_v1 {
std::map<std::string, struct ggml_v1_tensor *> tensors;
};

struct gptj_model_v2 {
struct gptj_v2_model {
gptj_hparams hparams;

// normalization

@ -122,7 +122,7 @@ struct gptj_model_v2 {
struct ggml_v2_tensor * lmh_g; // language model head
struct ggml_v2_tensor * lmh_b; // language model bias

std::vector<gptj_layer> layers;
std::vector<gptj_layer_v2> layers;

// key + value memory
struct ggml_v2_tensor * memory_k;