diff --git a/CMakeLists.txt b/CMakeLists.txt
index 78346ded7..aa50a490a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -285,6 +285,14 @@ target_compile_features(ggml_v1 PUBLIC c_std_11) # don't bump
 target_link_libraries(ggml_v1 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
 set_target_properties(ggml_v1 PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
+add_library(ggml_v2 OBJECT
+            otherarch/ggml_v2.c
+            otherarch/ggml_v2.h)
+target_include_directories(ggml_v2 PUBLIC . ./otherarch ./otherarch/tools)
+target_compile_features(ggml_v2 PUBLIC c_std_11) # don't bump
+target_link_libraries(ggml_v2 PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
+set_target_properties(ggml_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
 add_library(common2
             examples/common.cpp
             examples/common.h)
diff --git a/Makefile b/Makefile
index e4d9ed52b..840463749 100644
--- a/Makefile
+++ b/Makefile
@@ -228,7 +228,21 @@ ggml_clblast.o: ggml.c ggml.h
	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
 ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
-ggml-opencl-legacy.o: ggml-opencl-legacy.c ggml-opencl-legacy.h
+
+#version 2 libs
+ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) -c $< -o $@
+ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(OPENBLAS_FLAGS) -c $< -o $@
+ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+	$(CC) $(CFLAGS) -c $< -o $@
+ggml_v2_openblas_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+	$(CC) $(CFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
+ggml_v2_clblast.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+	$(CC) $(CFLAGS) $(BONUSCFLAGS1) $(BONUSCFLAGS2) $(CLBLAST_FLAGS) -c $< -o $@
+ggml_v2-opencl.o: otherarch/ggml_v2-opencl.cpp otherarch/ggml_v2-opencl.h
+	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
+ggml_v2-opencl-legacy.o: otherarch/ggml_v2-opencl-legacy.c otherarch/ggml_v2-opencl-legacy.h
	$(CC) $(CFLAGS) -c $< -o $@
 
 #extreme old version compat
@@ -264,19 +278,15 @@ main: examples/main/main.cpp build-info.h ggml.o llama.o common.o $(OBJS)
	@echo '====  Run ./main -h for help.  ===='
	@echo
 
-koboldcpp: ggml.o ggml_v1.o expose.o common.o gpttype_adapter.o $(OBJS)
+koboldcpp: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(OBJS)
	$(DEFAULT_BUILD)
-
-koboldcpp_openblas: ggml_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o
-	$(OPENBLAS_BUILD)
-
-koboldcpp_noavx2: ggml_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
+koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o
+	$(OPENBLAS_BUILD)
+koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
	$(NOAVX2_BUILD)
-
-koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
+koboldcpp_openblas_noavx2: ggml_openblas_noavx2.o ggml_v2_openblas_noavx2.o ggml_v1_noavx2.o expose.o common.o gpttype_adapter.o
	$(OPENBLAS_NOAVX2_BUILD)
-
-koboldcpp_clblast: ggml_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml-opencl-legacy.o
+koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o
	$(CLBLAST_BUILD)
 
 quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o
diff --git a/expose.cpp b/expose.cpp
index 22b1ebf61..104e71043 100644
--- a/expose.cpp
+++ b/expose.cpp
@@ -63,7 +63,7 @@ extern "C"
         putenv((char*)deviceenv.c_str());
         executable_path = inputs.executable_path;
 
-        if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
+        if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2 || file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4 || file_format==FileFormat::GPTJ_5)
         {
             printf("\n---\nIdentified as GPT-J model: (ver %d)\nAttempting to Load...\n---\n", file_format);
             ModelLoadResult lr = gpttype_load_model(inputs, file_format);
@@ -141,7 +141,7 @@ extern "C"
                 return true;
             }
         }
-        else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
+        else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5|| file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
         {
             printf("\n---\nIdentified as GPT-NEO-X model: (ver %d)\nAttempting to Load...\n---\n", file_format);
             ModelLoadResult lr = gpttype_load_model(inputs, file_format);
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index af9c7651f..35f95863f 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -6,6 +6,7 @@
 #define CL_TARGET_OPENCL_VERSION 110
 #include 
+#include 
 
 #include 
 #include 
@@ -16,55 +17,45 @@
 #define CL_DMMV_BLOCK_SIZE 32;
 
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
-std::string program_source = MULTILINE_QUOTE(
+static std::string program_source = MULTILINE_QUOTE(
 
 typedef char int8_t;
 typedef uchar uint8_t;
 typedef int int32_t;
 typedef uint uint32_t;
 
-constant uint QK4_0 = 32;
-constant uint QR4_0 = 2;
 struct block_q4_0
 {
     half d;
-    uint8_t qs[QK4_0 / 2];
+    uint8_t qs[16];
 };
 
-constant uint QK4_1 = 32;
-constant uint QR4_1 = 2;
 struct block_q4_1
 {
     half d;
     half m;
-    uint8_t qs[QK4_1 / 2];
+    uint8_t qs[16];
 };
 
-constant uint QK5_0 = 32;
-constant uint QR5_0 = 2;
 struct __attribute__ ((packed)) block_q5_0
 {
     half d;
     uint32_t qh;
-    uint8_t qs[QK5_0 / 2];
+    uint8_t qs[16];
 };
 
-constant uint QK5_1 = 32;
-constant uint QR5_1 = 2;
 struct block_q5_1
 {
     half d;
     half m;
     uint32_t qh;
-    uint8_t qs[QK5_1 / 2];
+    uint8_t qs[16];
 };
 
-constant uint QK8_0 = 32;
-constant uint QR8_0 = 1;
 struct block_q8_0
 {
     half d;
-    uint8_t qs[QK8_0];
+    uint8_t qs[32];
 };
 
@@ -135,13 +126,13 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
     *v0 = vi0*d;
     *v1 = vi1*d;
 }
-void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
+static void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
     *v0 = vload_half(0, &x[ib + 0]);
     *v1 = vload_half(0, &x[ib + 1]);
 }
 );
 
-std::string dequant_template = MULTILINE_QUOTE(
+static std::string dequant_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
     const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
@@ -165,7 +156,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 }
 );
 
-std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
+static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
     const int block_size = get_local_size(0);
     const int row = get_global_id(0) / block_size;
@@ -207,29 +198,29 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 }
 );
 
-std::array dequant_str_keys = {
+static std::array dequant_str_keys = {
     "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
 };
 
-std::array dequant_str_values = {
-    "dequantize_row_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
-    "dequantize_row_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
-    "dequantize_row_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
-    "dequantize_row_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
-    "dequantize_row_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
+static std::array dequant_str_values = {
+    "dequantize_row_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
+    "dequantize_row_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
+    "dequantize_row_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
+    "dequantize_row_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1",
+    "dequantize_row_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0",
     "convert_row_f16", "half", "1", "1", "convert_f16"
 };
 
-std::array dequant_mul_mat_vec_str_values = {
-    "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "QK4_0", "QR4_0", "dequantize_q4_0",
-    "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "QK4_1", "QR4_1", "dequantize_q4_1",
-    "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "QK5_0", "QR5_0", "dequantize_q5_0",
-    "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "QK5_1", "QR5_1", "dequantize_q5_1",
-    "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "QK8_0", "QR8_0", "dequantize_q8_0",
+static std::array dequant_mul_mat_vec_str_values = {
+    "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
+    "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
+    "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
+    "dequantize_mul_mat_vec_q5_1", "struct block_q5_1", "32", "2", "dequantize_q5_1",
+    "dequantize_mul_mat_vec_q8_0", "struct block_q8_0", "32", "1", "dequantize_q8_0",
     "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
 };
 
-std::string& replace(std::string& s, const std::string& from, const std::string& to) {
+static std::string& sreplace(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
         s.replace(pos, from.length(), to);
@@ -238,15 +229,15 @@ std::string& replace(std::string& s, const std::string& from, const std::string&
     return s;
 }
 
-std::string generate_kernels() {
+static std::string generate_kernels() {
     std::stringstream src;
     src << program_source << '\n';
     for (size_t i = 0; i < dequant_str_values.size(); i += dequant_str_keys.size()) {
         std::string dequant_kernel = dequant_template;
         std::string dmmv_kernel = dequant_mul_mat_vec_template;
         for (size_t j = 0; j < dequant_str_keys.size(); j++) {
-            replace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
-            replace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
+            sreplace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
+            sreplace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
         }
         src << dequant_kernel << '\n';
         src << dmmv_kernel << '\n';
@@ -259,6 +250,7 @@ std::string generate_kernels() {
         cl_int err_ = (err);                                                    \
         if (err_ != CL_SUCCESS) {                                               \
             fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__);   \
+            fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n");      \
             exit(1);                                                            \
         }                                                                       \
     } while (0)
@@ -271,7 +263,7 @@ static cl_program program;
 static cl_kernel convert_row_f16_cl;
 static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl;
 static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl;
-static bool fp16_support;
+static bool fp16_support = false;
 
 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
     cl_program p;
@@ -339,6 +331,8 @@ void ggml_cl_init(void) {
     }
     free(ext_buffer);
     printf("Using Platform: %s Device: %s FP16: %d\n", platform_buffer, device_buffer, fp16_support);
+    fp16_support = false;
+    printf("CL FP16 temporarily disabled pending further optimization.\n");
     context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
     CL_CHECK(err, "clCreateContext");
     queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
@@ -552,17 +546,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     // compute
     cl_event ev_sgemm;
-    clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor,
-                                               clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                               ne01, ne11, ne10,
-                                               alpha,
-                                               d_X, 0, ne00,
-                                               d_Y, 0, ne10,
-                                               beta,
-                                               d_D, 0, ne01,
-                                               &queue, &ev_sgemm);
+    clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
+                                                (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
+                                                ne01, ne11, ne10,
+                                                alpha,
+                                                d_X, 0, ne00,
+                                                d_Y, 0, ne10,
+                                                beta,
+                                                d_D, 0, ne01,
+                                                &queue, &ev_sgemm);
 
     if (status != clblast::StatusCode::kSuccess) {
+        printf("\nF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
         GGML_ASSERT(false);
     }
 
@@ -650,18 +645,19 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
         // compute
         cl_event ev_sgemm;
-        clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor,
-                                                   clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                   ne01, ne11, ne10,
-                                                   alpha,
-                                                   d_X, 0, ne00,
-                                                   d_Y, 0, ne10,
-                                                   beta,
-                                                   d_D, 0, ne01,
-                                                   &queue, &ev_sgemm);
+        clblast::StatusCode status = (clblast::StatusCode)CLBlastHgemm((CLBlastLayout)clblast::Layout::kColMajor,
+                                                    (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
+                                                    ne01, ne11, ne10,
+                                                    alpha,
+                                                    d_X, 0, ne00,
+                                                    d_Y, 0, ne10,
+                                                    beta,
+                                                    d_D, 0, ne01,
+                                                    &queue, &ev_sgemm);
 
         if (status != clblast::StatusCode::kSuccess) {
-            GGML_ASSERT(false);
+            printf("\nF16 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
+            GGML_ASSERT(false);
         }
 
         // copy dst to host, then convert to float
@@ -757,17 +753,18 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
         CL_CHECK(clFinish(queue), "clFinish");
 
         // compute
-        clblast::StatusCode status = clblast::Gemm(clblast::Layout::kColMajor,
-                                                   clblast::Transpose::kYes, clblast::Transpose::kNo,
-                                                   ne01, ne11, ne10,
-                                                   alpha,
-                                                   d_X, 0, ne00,
-                                                   d_Y, 0, ne10,
-                                                   beta,
-                                                   d_D, 0, ne01,
-                                                   &queue, &ev_sgemm);
+        clblast::StatusCode status = (clblast::StatusCode)CLBlastSgemm((CLBlastLayout)clblast::Layout::kColMajor,
+                                                    (CLBlastTranspose)clblast::Transpose::kYes, (CLBlastTranspose)clblast::Transpose::kNo,
+                                                    ne01, ne11, ne10,
+                                                    alpha,
+                                                    d_X, 0, ne00,
+                                                    d_Y, 0, ne10,
+                                                    beta,
+                                                    d_D, 0, ne01,
+                                                    &queue, &ev_sgemm);
 
         if (status != clblast::StatusCode::kSuccess) {
+            printf("\nQF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status);
             GGML_ASSERT(false);
         }
     }
diff --git a/ggml-opencl.h b/ggml-opencl.h
index e69de29bb..7802b4fc4 100644
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+void ggml_cl_init(void);
+
+bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+
+void * ggml_cl_host_malloc(size_t size);
+void   ggml_cl_host_free(void * ptr);
+
+void ggml_cl_transform_tensor(struct ggml_tensor * tensor);
+
+#ifdef  __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/ggml.c b/ggml.c
index 939ab4d62..ede115b34 100644
--- a/ggml.c
+++ b/ggml.c
@@ -512,7 +512,7 @@ static inline int hsum_i32_4(const __m128i a) {
     return _mm_cvtsi128_si32(_mm_add_epi32(sum64, hi32));
 }
 
-#if defined(__AVX2__) || defined(__AVX512F__)
+#if __AVX2__ || __AVX512F__
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
@@ -688,7 +688,7 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
-#if defined(__ARM_NEON)
+#if __ARM_NEON
 
 #if !defined(__aarch64__)
 
@@ -2481,7 +2481,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
             sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
         }
 
-        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
+        sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
     }
 
     *s = sumf;
@@ -9289,7 +9289,7 @@ static void ggml_compute_forward_rms_norm_back(
 
 // ggml_compute_forward_mul_mat
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_compute_forward_mul_mat_use_blas(
@@ -9330,7 +9330,7 @@ static void ggml_compute_forward_mul_mat_f32(
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     const int64_t ne10 = src1->ne[0];
 #endif
     const int64_t ne11 = src1->ne[1];
@@ -9394,9 +9394,16 @@ static void ggml_compute_forward_mul_mat_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -9416,21 +9423,11 @@ static void ggml_compute_forward_mul_mat_f32(
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-#if defined(GGML_USE_CLBLAST)
-                // zT = y * xT
-                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne10,
-                        0.0f,    d, ne01,
-                        GGML_TYPE_F32);
-#else
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                         ne11, ne01, ne10,
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
-#endif
             }
         }
         //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@@ -9569,9 +9566,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_ASSERT(nb10 == sizeof(float));
 
@@ -9601,20 +9605,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                     assert(id*sizeof(float) <= params->wsize);
                 }
 
-#if defined(GGML_USE_CLBLAST)
-                const float * x = wdata;
-                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
-
-                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-
-                // zT = y * xT
-                ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne10,
-                        0.0f,    d, ne01,
-                        GGML_TYPE_F32);
-#else
                 const float * x = wdata;
                 const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
 
@@ -9626,7 +9616,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                         1.0f,    y, ne10,
                                  x, ne00,
                         0.0f,    d, ne01);
-#endif
             }
         }
 
@@ -9789,9 +9778,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
         }
         return;
     }
+#elif defined(GGML_USE_CLBLAST)
+    if (ggml_cl_can_mul_mat(src0, src1, dst)) {
+        if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
+            ggml_cl_mul_mat(src0, src1, dst, params->wdata, params->wsize);
+        }
+        return;
+    }
 #endif
 
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         if (params->ith != 0) {
             return;
@@ -9814,9 +9810,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
             float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
-#if defined(GGML_USE_CLBLAST)
-            const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
-#else
             {
                 size_t id = 0;
                 for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -9828,23 +9821,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
             }
 
             const float * x = wdata;
-#endif
 
-#if defined(GGML_USE_CLBLAST)
-            // zT = y * xT
-            ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
-                    ne11, ne01, ne10,
-                    1.0f,    y, ne10,
-                             x, ne10,
-                    0.0f,    d, ne01,
-                    type);
-#else
             cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                     ne11, ne01, ne10,
                     1.0f,    y, ne10,
                             x, ne00,
                     0.0f,    d, ne01);
-#endif
         }
     }
 
@@ -13941,9 +13923,16 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                             cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                         }
                         else
+#elif defined(GGML_USE_CLBLAST)
+                        if (ggml_cl_can_mul_mat(node->src0, node->src1, node)) {
+                            node->n_tasks = 1; // TODO: this actually is doing nothing
+                                               //       the threads are still spinning
+                            cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
+                        }
+                        else
 #endif
                         if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
@@ -13957,13 +13946,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                         } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                             cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                             }
 #endif
                         } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -15468,4 +15457,4 @@ int ggml_cpu_has_vsx(void) {
 #endif
 }
 
-////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
\ No newline at end of file
diff --git a/ggml.h b/ggml.h
index dce5ca1e7..6fcc78153 100644
--- a/ggml.h
+++ b/ggml.h
@@ -249,6 +249,7 @@ extern "C" {
     enum ggml_backend {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_CUDA = 1,
+        GGML_BACKEND_CL = 2,
     };
 
     // model file types
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index feabb1848..ee5658b38 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -12,30 +12,42 @@
 #include "otherarch.h"
 
 //for easier compilation
-#include "llama_v2.cpp"
-
 //concat source files into one file for compilation purposes
+#include "llama_v2.cpp"
+#include "llama.cpp"
 #include "utils.cpp"
 #include "gptj_v1.cpp"
 #include "gptj_v2.cpp"
+#include "gptj_v3.cpp"
 #include "gpt2_v1.cpp"
 #include "gpt2_v2.cpp"
+#include "gpt2_v3.cpp"
 #include "rwkv_v2.cpp"
 #include "neox_v2.cpp"
+#include "neox_v3.cpp"
 
 //return val: 0=fail, 1=(original ggml, alpaca), 2=(ggmf), 3=(ggjt)
 static FileFormat file_format = FileFormat::BADFORMAT;
 
 static gpt_vocab vocab;
-static gptj_model_v1 gptj_ctx_v1;
-static gptj_model gptj_ctx_v2;
+
+static gptj_v1_model gptj_ctx_v1;
+static gptj_v2_model gptj_ctx_v2;
+static gptj_model gptj_ctx_v3;
+
 static gpt2_v1_model gpt2_ctx_v1;
-static gpt2_model gpt2_ctx_v2;
-static gpt_neox_model neox_ctx;
+static gpt2_v2_model gpt2_ctx_v2;
+static gpt2_model gpt2_ctx_v3;
+
+static gpt_neox_v2_model neox_ctx_v2;
+static gpt_neox_model neox_ctx_v3;
+
 static rwkv_context * rwkv_ctx_v1;
+static llama_v2_context_params llama_ctx_params_v2;
 static llama_context_params llama_ctx_params;
-static llama_context * llama_ctx_v1;
+static llama_v2_context * llama_ctx_v2;
+static llama_context * llama_ctx_v3;
 
 static gpt_params params;
 static int n_past = 0;
@@ -81,7 +93,6 @@ inline bool LogitsDuplicated(std::vector & arr1, std::vector & arr
 
 llama_token sample_token(llama_token_data_array * candidates, std::mt19937 & rng)
 {
-    const int64_t t_start_sample_us = ggml_time_us();
     llama_sample_softmax(nullptr, candidates);
     std::vector probs;
     probs.reserve(candidates->size);
@@ -220,7 +231,9 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     blasbatchsize = inputs.blasbatchsize;
     params.memory_f16 = inputs.f16_kv;
     params.n_ctx = inputs.max_context_length;
-    neox_ctx.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = params.n_ctx;
+
+    neox_ctx_v2.hparams.n_ctx = gptj_ctx_v1.hparams.n_ctx = gptj_ctx_v2.hparams.n_ctx = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx
+    = neox_ctx_v3.hparams.n_ctx = gptj_ctx_v3.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx = params.n_ctx;
 
     printf("System Info: %s\n", llama_print_system_info());
     SetQuantsUnshuffled(false);
@@ -229,33 +242,31 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         //newer format has bit unshuffling
         SetQuantsUnshuffled(file_format == FileFormat::GGJT_2);
 
-        llama_ctx_params = llama_context_default_params();
-        llama_ctx_params.n_ctx = inputs.max_context_length;
+        llama_ctx_params_v2 = llama_v2_context_default_params();
+        llama_ctx_params_v2.n_ctx = inputs.max_context_length;
         //llama_ctx_params.n_parts = -1;
-        llama_ctx_params.seed = -1;
-        llama_ctx_params.f16_kv = inputs.f16_kv;
-        llama_ctx_params.logits_all = false;
-        llama_ctx_params.use_mmap = inputs.use_mmap;
-        llama_ctx_params.use_mlock = inputs.use_mlock;
-        llama_ctx_params.n_gpu_layers = inputs.gpulayers;
+        llama_ctx_params_v2.seed = -1;
+        llama_ctx_params_v2.f16_kv = inputs.f16_kv;
+        llama_ctx_params_v2.logits_all = false;
+        llama_ctx_params_v2.use_mmap = inputs.use_mmap;
+        llama_ctx_params_v2.use_mlock = inputs.use_mlock;
+        llama_ctx_params_v2.n_gpu_layers = inputs.gpulayers;
 
-        llama_ctx_v1 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+        llama_ctx_v2 = llama_v2_init_from_file(modelname.c_str(), llama_ctx_params_v2);
 
-        if (llama_ctx_v1 == NULL)
+        if (llama_ctx_v2 == NULL)
         {
             fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
             return ModelLoadResult::FAIL;
         }
-        if (file_format < FileFormat::GGJT_2)
-        {
-            printf("\n---\nWarning: Your model may be an OUTDATED format (ver %d). Please reconvert it for better results!\n---\n", file_format);
-        }
-
+
+        printf("\n---\nWarning: Your model may be an OUTDATED format (ver %d). Please reconvert it for better results!\n---\n", file_format);
+
         if (lora_filename != "")
         {
             printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
 
-            int err = llama_apply_lora_from_file(llama_ctx_v1,
+            int err = llama_v2_apply_lora_from_file(llama_ctx_v2,
                                                  lora_filename.c_str(),
                                                  NULL,
                                                  n_threads);
@@ -268,9 +279,47 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
         //determine mem per token
         const std::vector tmp = {1, 2, 3, 4};
-        llama_eval(llama_ctx_v1, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_v2_eval(llama_ctx_v2, tmp.data(), tmp.size(), 0, params.n_threads);
         return ModelLoadResult::SUCCESS;
+    }
+    else if(file_format == FileFormat::GGJT_3)
+    {
+        llama_ctx_params = llama_context_default_params();
+        llama_ctx_params.n_ctx = inputs.max_context_length;
+        //llama_ctx_paran_parts = -1;
+        llama_ctx_params.seed = -1;
+        llama_ctx_params.f16_kv = inputs.f16_kv;
+        llama_ctx_params.logits_all = false;
+        llama_ctx_params.use_mmap = inputs.use_mmap;
+        llama_ctx_params.use_mlock = inputs.use_mlock;
+        llama_ctx_params.n_gpu_layers = inputs.gpulayers;
+
+        llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
+
+        if (llama_ctx_v3 == NULL)
+        {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, modelname.c_str());
+            return ModelLoadResult::FAIL;
+        }
+        if (lora_filename != "")
+        {
+            printf("\nAttempting to apply LORA adapter: %s\n", lora_filename.c_str());
+
+            int err = llama_apply_lora_from_file(llama_ctx_v3,
+                                                 lora_filename.c_str(),
+                                                 NULL,
+                                                 n_threads);
+            if (err != 0)
+            {
+                fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+                return ModelLoadResult::FAIL;
+            }
+        }
+        //determine mem per token
+        const std::vector tmp = {1, 2, 3, 4};
+        llama_eval(llama_ctx_v3, tmp.data(), tmp.size(), 0, params.n_threads);
+        return ModelLoadResult::SUCCESS;
     }
     else if (file_format == FileFormat::RWKV_1)
     {
@@ -329,25 +378,45 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         legacy_gpt2_eval(gpt2_ctx_v1, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
         return ModelLoadResult::SUCCESS;
     }
-    else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
+    else if (file_format == FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3 || file_format==FileFormat::GPT2_4)
     {
-        //newer format has bit unshuffling
-        SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
-
-        ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format, inputs.gpulayers);
-        if(res==ModelLoadResult::FAIL)
+        if(file_format==FileFormat::GPT2_4)
         {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return res;
-        }
-        else if(res==ModelLoadResult::RETRY_LOAD)
-        {
-            printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
-            return res;
+            ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v3, vocab, file_format, inputs.gpulayers);
+            if(res==ModelLoadResult::FAIL)
+            {
+                fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+                return res;
+            }
+            else if(res==ModelLoadResult::RETRY_LOAD)
+            {
+                printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
+                return res;
+            }
+            // determine the required inference memory per token:
+            gpt2_eval(gpt2_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
+            return ModelLoadResult::SUCCESS;
+        }
+        else
+        {
+            //newer format has bit unshuffling
+            SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
+
+            ModelLoadResult res = gpt2_v2_model_load(params.model, gpt2_ctx_v2, vocab, file_format, inputs.gpulayers);
+            if(res==ModelLoadResult::FAIL)
+            {
+                fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+                return res;
+            }
+            else if(res==ModelLoadResult::RETRY_LOAD)
+            {
+                printf("\nTensor Transposition Detected! Retrying GPT-2 model loading...");
+                return res;
+            }
+            // determine the required inference memory per token:
+            gpt2_v2_eval(gpt2_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
+            return ModelLoadResult::SUCCESS;
         }
-        // determine the required inference memory per token:
-        gpt2_eval(gpt2_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, file_format);
-        return ModelLoadResult::SUCCESS;
     }
     else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
     {
@@ -375,82 +444,146 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 
         return ModelLoadResult::SUCCESS;
     }
-    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
+    else if(file_format == FileFormat::GPTJ_3 || file_format == FileFormat::GPTJ_4 || file_format == FileFormat::GPTJ_5)
     {
-        //newer format has bit unshuffling
-        SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
-
-        ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx, vocab, file_format);
-        if(res==ModelLoadResult::FAIL)
+        if(file_format == FileFormat::GPTJ_5)
        {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return res;
-        }
-        else if(res==ModelLoadResult::RETRY_LOAD)
-        {
-            printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
-            return res;
-        }
-
-        // determine the required inference memory per token:
-        gpt_neox_eval(neox_ctx, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-        if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
-        {
-            //run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
-            std::vector test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
-            auto orig_par_res = neox_ctx.hparams.par_res;
-            neox_ctx.hparams.par_res = 0; //test with residual false
-            gpt_neox_eval(neox_ctx, params.n_threads, 0, test_embd, logits, mem_per_token);
-            neox_ctx.hparams.par_res = orig_par_res;
-            int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
-            std::string predicted = vocab.id_to_token[topid].c_str();
-            auto findresult = predicted.find("8");
-            if(findresult != std::string::npos && findresult<2)
+            ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v3, vocab, inputs.gpulayers);
+            if (loadresult == ModelLoadResult::FAIL)
             {
-                printf("\n---\nOld RedPajama NeoX Detected! Switching to new format! (use_parallel_residual=False)\n");
-                ggml_free(neox_ctx.ctx);
+                fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+                return loadresult;
+            }
+            else if (loadresult == ModelLoadResult::RETRY_LOAD)
+            {
+                printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
+                return loadresult;
+            }
+
+            // determine the required inference memory per token:
+            gptj_eval(gptj_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+            //if the logits are NAN or duplicated, it means the model is incompatible
+            std::vector oldlogits(logits);
+
+            //this is another hack because they change the library - we run the eval through the model
+            //twice and compare logits. if they give the same logits for different inputs, model is broken
+            gptj_eval(gptj_ctx_v3, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
+
+            if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
+            {
+                printf("\nBad Logits detected! Retrying GPT-J model loading...");
+                ggml_free(gptj_ctx_v3.ctx);
                 return ModelLoadResult::RETRY_LOAD;
             }
-        }
 
-        return ModelLoadResult::SUCCESS;
+            return ModelLoadResult::SUCCESS;
+        }
+        else
+        {
+            //newer format has bit unshuffling
+            SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
+
+            ModelLoadResult loadresult = gptj_v2_model_load(params.model, gptj_ctx_v2, vocab, inputs.gpulayers);
+            if (loadresult == ModelLoadResult::FAIL)
+            {
+                fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+                return loadresult;
+            }
+            else if (loadresult == ModelLoadResult::RETRY_LOAD)
+            {
+                printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
+                return loadresult;
+            }
+
+            // determine the required inference memory per token:
+            gptj_v2_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+            //if the logits are NAN or duplicated, it means the model is incompatible
+            std::vector oldlogits(logits);
+
+            //this is another hack because they change the library - we run the eval through the model
+            //twice and compare logits. if they give the same logits for different inputs, model is broken
+            gptj_v2_eval(gptj_ctx_v2, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
+
+            if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
+            {
+                printf("\nBad Logits detected! Retrying GPT-J model loading...");
+                ggml_v2_free(gptj_ctx_v2.ctx);
+                return ModelLoadResult::RETRY_LOAD;
+            }
+
+            return ModelLoadResult::SUCCESS;
+        }
+    }
+    else if(file_format==FileFormat::NEOX_1 || file_format==FileFormat::NEOX_2 || file_format==FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5|| file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
+    {
+        if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
+        {
+            ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
+            if(res==ModelLoadResult::FAIL)
+            {
+                fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+                return res;
+            }
+            else if(res==ModelLoadResult::RETRY_LOAD)
+            {
+                printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
+                return res;
+            }
+
+            // determine the required inference memory per token:
+            gpt_neox_eval(neox_ctx_v3, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+            return ModelLoadResult::SUCCESS;
+        }
+        else
+        {
+            //newer format has bit unshuffling
+            SetQuantsUnshuffled(file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5);
+
+            ModelLoadResult res = gpt_neox_v2_model_load(params.model, neox_ctx_v2, vocab, file_format);
+            if(res==ModelLoadResult::FAIL)
+            {
+                fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+                return res;
+            }
+            else if(res==ModelLoadResult::RETRY_LOAD)
+            {
+                printf("\nIncorrect Tensor Size Detected! Retrying GPT-NeoX model loading...");
+                return res;
+            }
+
+            // determine the required inference memory per token:
+            gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+            if(logits.size()>0 && file_format==FileFormat::NEOX_2 && !IsNanCheck(logits[0]))
+            {
+                //run the black magic eval to determine if it's redpajama. VERY UGLY HACK!
+                std::vector test_embd = ::gpt_tokenize(vocab, "1 2 3 4 5 6 7");
+                auto orig_par_res = neox_ctx_v2.hparams.par_res;
+                neox_ctx_v2.hparams.par_res = 0; //test with residual false
+                gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, 0, test_embd, logits, mem_per_token);
+                neox_ctx_v2.hparams.par_res = orig_par_res;
+                int topid = std::max_element(logits.begin(),logits.end())-logits.begin();
+                std::string predicted = vocab.id_to_token[topid].c_str();
+                auto findresult = predicted.find("8");
+                if(findresult != std::string::npos && findresult<2)
+                {
+                    printf("\n---\nOld RedPajama NeoX Detected! Switching to new format! (use_parallel_residual=False)\n");
+                    ggml_v2_free(neox_ctx_v2.ctx);
+                    return ModelLoadResult::RETRY_LOAD;
+                }
+            }
+
+            return ModelLoadResult::SUCCESS;
+        }
     }
     else
     {
-        //newer format has bit unshuffling
-        SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
-
-        ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab, inputs.gpulayers);
-        if (loadresult == ModelLoadResult::FAIL)
-        {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
-            return loadresult;
-        }
-        else if (loadresult == ModelLoadResult::RETRY_LOAD)
-        {
-            printf("\nTensor Transposition Detected! Retrying GPT-J model loading...");
-            return loadresult;
-        }
-
-        // determine the required inference memory per token:
-        gptj_eval(gptj_ctx_v2, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
-
-        //if the logits are NAN or duplicated, it means the model is incompatible
-        std::vector oldlogits(logits);
-
-        //this is another hack because they change the library - we run the eval through the model
-        //twice and compare logits. if they give the same logits for different inputs, model is broken
-        gptj_eval(gptj_ctx_v2, params.n_threads, 0, {4, 5, 6, 7}, logits, mem_per_token);
-
-        if(logits.size()>0 && (IsNanCheck(logits[0]) || LogitsDuplicated(oldlogits,logits)))
-        {
-            printf("\nBad Logits detected! Retrying GPT-J model loading...");
-            ggml_free(gptj_ctx_v2.ctx);
-            return ModelLoadResult::RETRY_LOAD;
-        }
-
-        return ModelLoadResult::SUCCESS;
+        printf("\nUnknown Model, cannot load.\n");
+        return ModelLoadResult::FAIL;
     }
 }
@@ -501,16 +634,20 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
     // tokenize the prompt
     std::vector embd_inp;
-    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
+    if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
     {
         params.prompt.insert(0, 1, ' ');
-        if (file_format == FileFormat::GGML)
+        if(file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 )
         {
-            embd_inp = ::legacy_llama_tokenize(llama_ctx_v1, params.prompt, true);
+            embd_inp = ::llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
+        }
+        else if (file_format == FileFormat::GGML)
+        {
+            embd_inp = ::legacy_llama_v2_tokenize(llama_ctx_v2, params.prompt, true);
         }
         else
         {
-            embd_inp = ::llama_tokenize(llama_ctx_v1, params.prompt, true);
+            embd_inp = ::llama_tokenize(llama_ctx_v3, params.prompt, true);
         }
     }
     else
@@ -560,7 +697,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         //for non llama, limit to 256
         int bbs = blasbatchsize;
-        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2)
+        if (file_format != FileFormat::GGML && file_format != FileFormat::GGHF && file_format != FileFormat::GGJT && file_format != FileFormat::GGJT_2 && file_format != FileFormat::GGJT_3)
        {
            bbs = (blasbatchsize > 256 ? 256 : blasbatchsize);
        }
@@ -592,7 +729,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
     if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
     {
-        n_vocab = llama_n_vocab(llama_ctx_v1);
+        n_vocab = llama_v2_n_vocab(llama_ctx_v2);
+    }
+    else if(file_format == FileFormat::GGJT_3)
+    {
+        n_vocab = llama_n_vocab(llama_ctx_v3);
     }
     else if (file_format == FileFormat::GPTJ_1 || file_format == FileFormat::GPTJ_2)
     {
@@ -602,6 +743,10 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gptj_ctx_v2.hparams.n_vocab;
     }
+    else if(file_format==FileFormat::GPTJ_5)
+    {
+        n_vocab = gptj_ctx_v3.hparams.n_vocab;
+    }
     else if(file_format == FileFormat::GPT2_1)
     {
         n_vocab = gpt2_ctx_v1.hparams.n_vocab;
@@ -610,9 +755,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     {
         n_vocab = gpt2_ctx_v2.hparams.n_vocab;
     }
+    else if(file_format==FileFormat::GPT2_4)
+    {
+        n_vocab = gpt2_ctx_v3.hparams.n_vocab;
+    }
     else if(file_format == FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
     {
-        n_vocab = neox_ctx.hparams.n_vocab;
+        n_vocab = neox_ctx_v2.hparams.n_vocab;
+    }
+    else if( file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
+    {
+        n_vocab = neox_ctx_v3.hparams.n_vocab;
     }
     else if(file_format == FileFormat::RWKV_1)
     {
@@ -641,11 +794,18 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     if(debugmode)
     {
         printf("\n[Debug: Dump Input Tokens]\n");
-        if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
+        if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
         {
             for (auto id : embd_inp)
             {
-                printf("'%s (%d)', ",llama_token_to_str(llama_ctx_v1, id),id);
+                printf("'%s (%d)', ",llama_v2_token_to_str(llama_ctx_v2, id),id);
+            }
+        }
+        else if (file_format == FileFormat::GGJT_3)
+        {
+            for (auto id : embd_inp)
+            {
+                printf("'%s (%d)', ",llama_token_to_str(llama_ctx_v3, id),id);
             }
         }
         else
@@ -680,7 +840,11 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
             if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
             {
-                evalres = (llama_eval(llama_ctx_v1, embd.data(), embdsize, n_past, params.n_threads)==0);
+                evalres = (llama_v2_eval(llama_ctx_v2, embd.data(), embdsize, n_past, params.n_threads)==0);
+            }
+            else if(file_format == FileFormat::GGJT_3)
+            {
+                evalres = (llama_eval(llama_ctx_v3, embd.data(), embdsize, n_past, params.n_threads)==0);
             }
             else if(file_format==FileFormat::RWKV_1)
             {
@@ -694,20 +858,37 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
             }
             else if(file_format==FileFormat::GPT2_2 || file_format==FileFormat::GPT2_3)
             {
-                evalres = gpt2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
+                evalres = gpt2_v2_eval(gpt2_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
+            }
+            else if(file_format==FileFormat::GPT2_4)
+            {
+                evalres = gpt2_eval(gpt2_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
             }
             else if(file_format==FileFormat::NEOX_1 || file_format == FileFormat::NEOX_2 || file_format == FileFormat::NEOX_3 || file_format==FileFormat::NEOX_4 || file_format==FileFormat::NEOX_5)
             {
-                evalres = gpt_neox_eval(neox_ctx, params.n_threads, n_past, embd, logits, mem_per_token);
+                evalres = gpt_neox_v2_eval(neox_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
+            }
+            else if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
+            {
+                evalres = gpt_neox_eval(neox_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
             }
             else if(file_format==FileFormat::GPTJ_1 || file_format==FileFormat::GPTJ_2)
             {
                 evalres = legacy_gptj_eval(gptj_ctx_v1, params.n_threads, n_past, embd, logits, mem_per_token, file_format);
             }
+            else if(file_format==FileFormat::GPTJ_3 || file_format==FileFormat::GPTJ_4)
+            {
+                evalres = gptj_v2_eval(gptj_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
+            }
+            else if(file_format==FileFormat::GPTJ_5)
+            {
+                evalres = gptj_eval(gptj_ctx_v3, params.n_threads, n_past, embd, logits, mem_per_token);
+            }
             else
             {
-                evalres = gptj_eval(gptj_ctx_v2, params.n_threads, n_past, embd, logits, mem_per_token);
+                printf("\nCannot find eval function\n");
             }
+
             if (!evalres)
             {
                 fprintf(stderr, "Failed to predict\n");
@@ -739,9 +920,17 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                 printf("\n");
             }
 
-            if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
+            if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2 || file_format == FileFormat::GGJT_3)
             {
-                auto logits = llama_get_logits(llama_ctx_v1);
+                float * logits;
+                if(file_format == FileFormat::GGJT_3)
+                {
+                    logits = llama_get_logits(llama_ctx_v3);
+                }
+                else
+                {
+                    logits = llama_v2_get_logits(llama_ctx_v2);
+                }
 
                 if (!unbanTokens)
                 {
@@ -765,10 +954,12 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
                     if ((file_format == FileFormat::GPT2_1 ||
                          file_format == FileFormat::GPT2_2 ||
                          file_format == FileFormat::GPT2_3 ||
+                         file_format == FileFormat::GPT2_4 ||
                          file_format == FileFormat::GPTJ_1 ||
                          file_format == FileFormat::GPTJ_2 ||
                          file_format == FileFormat::GPTJ_3 ||
-                         file_format == FileFormat::GPTJ_4) &&
+                         file_format == FileFormat::GPTJ_4 ||
+                         file_format == FileFormat::GPTJ_5) &&
                         logits.size() > 50256)
                     {
                         logits[50256] = (logits[50256] < 0 ? logits[50256] : 0);
@@ -793,7 +984,15 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
 
             if (file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
             {
-                concat_output += llama_token_to_str(llama_ctx_v1, id);
+                if(file_format == FileFormat::GGJT_3)
+                {
+                    concat_output += llama_token_to_str(llama_ctx_v3, id);
+                }
+                else
+                {
+                    concat_output += llama_v2_token_to_str(llama_ctx_v2, id);
+                }
+
                 if(unbanTokens && id==llama_token_eos())
                 {
                     printf("\n(EOS token triggered!)");
diff --git a/koboldcpp.py b/koboldcpp.py
index c9990081e..894fbbfc0 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -200,7 +200,7 @@ maxctx = 2048
 maxlen = 128
 modelbusy = False
 defaultport = 5001
-KcppVersion = "1.23.1"
+KcppVersion = "1.24"
 
 class ServerRequestHandler(http.server.SimpleHTTPRequestHandler):
     sys_version = ""
diff --git a/llama.cpp b/llama.cpp
index dd449592a..745b95823 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11,6 +11,8 @@
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
+#elif defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
 #endif
 
 #include 
@@ -91,7 +93,7 @@ static const std::map & MEM_REQ_KV_SELF()
 static const std::map & MEM_REQ_EVAL()
 {
     static std::map k_sizes = {
-        { MODEL_7B,   768ull * MB },
+        { MODEL_7B,   800ull * MB },
        { MODEL_13B, 1024ull * MB },
        { MODEL_30B, 1280ull * MB },
        { MODEL_65B, 1536ull * MB },
@@ -933,7 +935,7 @@ static void llama_model_load_internal(
         if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
             hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0)
         {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+            printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
        }
    }
 
@@ -941,7 +943,7 @@ static void llama_model_load_internal(
         if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
             hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0)
         {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
+            printf("\nthis format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
        }
    }
 
@@ -1039,7 +1041,7 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
     model.mapping = std::move(ml->mapping);
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS)
     {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -1065,6 +1067,32 @@ static void llama_model_load_internal(
 
         fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
     }
+#elif defined(GGML_USE_CLBLAST)
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cl_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cl_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cl_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cl_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cl_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cl_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cl_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [opencl] offloading output layer to GPU\n", __func__);
+            ggml_cl_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
 #else
     (void) n_gpu_layers;
 #endif
diff --git a/model_adapter.cpp b/model_adapter.cpp
index 75b115bfc..5612da902 100644
--- a/model_adapter.cpp
+++ b/model_adapter.cpp
@@ -112,7 +112,14 @@ void print_tok_vec(std::vector &embd)
             temp %= 1000;
             if (qntvr != 0)
             {
-                fileformat = FileFormat::GPTJ_4;
+                if (qntvr == 1)
+                {
+                    fileformat = FileFormat::GPTJ_4;
+                }
+                else
+                {
+                    fileformat = FileFormat::GPTJ_5;
+                }
             }
             else if (temp != 0 && temp != 1)
             {
@@ -131,8 +138,15 @@ void print_tok_vec(std::vector &embd)
             const int32_t qntvr = temp / 1000;
             temp %= 1000;
             if (qntvr != 0)
-            {
-                fileformat = FileFormat::GPT2_3;
+            {
+                if (qntvr == 1)
+                {
+                    fileformat = FileFormat::GPT2_3;
+                }
+                else
+                {
+                    fileformat = FileFormat::GPT2_4;
+                }
             }
             else if (temp != 0 && temp != 1)
             {
@@ -142,7 +156,7 @@ void print_tok_vec(std::vector &embd)
         else if(vocabsiz < 31998 || vocabsiz > 33000)
         {
             //anything outside the llama v1 range is assumed to be NeoX
-            fileformat = FileFormat::NEOX_4;
+            fileformat = FileFormat::NEOX_6;
             uint32_t temp,temp2;
             fin.read((char *)&temp, sizeof(temp)); //ctx
             fin.read((char *)&temp, sizeof(temp)); //n_embd
@@ -169,17 +183,21 @@ void print_tok_vec(std::vector &embd)
                 if((temp==0||temp==1)&&(temp2==0||temp2==1))//special case: par_res and ftype are both 1 or 0
                 {
                     //its a f16/f32 model in the new format
-                    fileformat = FileFormat::NEOX_4;
+                    fileformat = temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6;
                 }
             }
             else
            {
                 const int32_t qntvr = temp2 / 1000; //for future use
-                //then temp was par_res
-                if(temp==0) //use_parallel_residual is false in RedPajama
+                //then temp was par_res, use_parallel_residual is false in RedPajama
+                if(qntvr==1)
                 {
-                    fileformat = FileFormat::NEOX_5;
+                    fileformat = (temp==0?FileFormat::NEOX_5:FileFormat::NEOX_4);
                 }
+                else
+                {
+                    fileformat = (temp==0?FileFormat::NEOX_7:FileFormat::NEOX_6);
+                }
             }
         }
 
@@ -197,7 +215,7 @@ void print_tok_vec(std::vector &embd)
    }
    else if(magic == 0x67676a74) //v3 format ggjt
    {
-        fileformat = FileFormat::GGJT_2; //ggjt by default
+        fileformat = FileFormat::GGJT_3; //ggjt by default
        uint32_t ver, temp, ftype;
        fin.read((char *)&ver, sizeof(ver)); //file version
        fin.read((char *)&temp, sizeof(temp));//vocab
@@ -212,6 +230,10 @@ void print_tok_vec(std::vector &embd)
        {
            fileformat = FileFormat::GGJT;
        }
+        else if(ver==2)
+        {
+            fileformat = FileFormat::GGJT_2;
+        }
    }
 
    fin.close();
diff --git a/model_adapter.h b/model_adapter.h
index c6eb41582..4b418d4d8 100644
--- a/model_adapter.h
+++ b/model_adapter.h
@@ -20,15 +20,18 @@ enum FileFormat
     GGHF=2, // 2=(llama ggmf)
     GGJT=3, // 3=(llama ggjt)
     GGJT_2=4, //newer llama format unshuffled
+    GGJT_3=5, //using 16bit scalar
 
     GPTJ_1=100, //the very first super old GPTJ format
     GPTJ_2=101, //pygmalion, uses old ggml lib
     GPTJ_3=102, //uses new ggml lib
     GPTJ_4=103, //unshuffled
+    GPTJ_5=104, //using 16bit scalar
 
     GPT2_1=200,
     GPT2_2=201,
     GPT2_3=202, //unshuffled
+    GPT2_4=203, //using 16bit scalar
 
     RWKV_1=300,
 
@@ -37,6 +40,8 @@ enum FileFormat
     NEOX_3=402, //redpajama
     NEOX_4=403, //unshuffled
     NEOX_5=404, //unshuffled redpajama
+    NEOX_6=405, //using 16bit scalar
+    NEOX_7=406, //using 16bit scalar redpajama
 };
 
 enum ModelLoadResult
diff --git a/otherarch/ggml_v2-opencl.cpp b/otherarch/ggml_v2-opencl.cpp
index 23afdb34c..48d2d7f59 100644
--- a/otherarch/ggml_v2-opencl.cpp
+++ b/otherarch/ggml_v2-opencl.cpp
@@ -12,12 +12,12 @@
 #include 
 #include 
 
-#include "ggml.h"
+#include "ggml_v2.h"
 
 #define CL_DMMV_BLOCK_SIZE 32;
 
 #define MULTILINE_QUOTE(...) #__VA_ARGS__
-std::string program_source = MULTILINE_QUOTE(
+static std::string program_source = MULTILINE_QUOTE(
 
 typedef char int8_t;
 typedef uchar uint8_t;
@@ -126,13 +126,13 @@ void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const in
     *v0 = vi0*d;
     *v1 = vi1*d;
 }
-void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
+static void convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){
     *v0 = vload_half(0, &x[ib + 0]);
     *v1 = vload_half(0, &x[ib + 1]);
 }
 );
 
-std::string dequant_template = MULTILINE_QUOTE(
+static std::string dequant_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
     const int i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2;
@@ -156,7 +156,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) {
 }
 );
 
-std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
+static std::string dequant_mul_mat_vec_template = MULTILINE_QUOTE(
 __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) {
     const int block_size = get_local_size(0);
     const int row = get_global_id(0) / block_size;
@@ -198,11 +198,11 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
 }
 );
 
-std::array dequant_str_keys = {
+static std::array dequant_str_keys = {
     "KERNEL_NAME", "X_TYPE", "QUANT_K", "QUANT_R", "DEQUANT_FUNC"
 };
 
-std::array dequant_str_values = {
+static std::array dequant_str_values = {
     "dequantize_row_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
     "dequantize_row_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
     "dequantize_row_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
@@ -211,7 +211,7 @@ std::array dequant_str_values = {
     "convert_row_f16", "half", "1", "1", "convert_f16"
 };
 
-std::array dequant_mul_mat_vec_str_values = {
+static std::array dequant_mul_mat_vec_str_values = {
     "dequantize_mul_mat_vec_q4_0", "struct block_q4_0", "32", "2", "dequantize_q4_0",
     "dequantize_mul_mat_vec_q4_1", "struct block_q4_1", "32", "2", "dequantize_q4_1",
     "dequantize_mul_mat_vec_q5_0", "struct block_q5_0", "32", "2", "dequantize_q5_0",
@@ -220,7 +220,7 @@ std::array dequant_mul_mat_vec_str_values = {
     "convert_mul_mat_vec_f16", "half", "1", "1", "convert_f16"
 };
 
-static std::string& sreplace(std::string& s, const std::string& from, const std::string& to) {
+static std::string& sreplace2(std::string& s, const std::string& from, const std::string& to) {
     size_t pos = 0;
     while ((pos = s.find(from, pos)) != std::string::npos) {
         s.replace(pos, from.length(), to);
@@ -236,8 +236,8 @@ static std::string generate_kernels() {
         std::string dequant_kernel = dequant_template;
         std::string dmmv_kernel = dequant_mul_mat_vec_template;
         for (size_t j = 0; j < dequant_str_keys.size(); j++) {
-            sreplace(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
-            sreplace(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
+            sreplace2(dequant_kernel, dequant_str_keys[j], dequant_str_values[i + j]);
+            sreplace2(dmmv_kernel, dequant_str_keys[j], dequant_mul_mat_vec_str_values[i + j]);
         }
         src << dequant_kernel << '\n';
         src << dmmv_kernel << '\n';
@@ -296,13 +296,13 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
     return p;
 }
 
-void ggml_cl_init(void) {
+void ggml_v2_cl_init(void) {
     cl_int err = 0;
-    char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
-    char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
-    int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
-    int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
-    printf("\nInitializing CLBlast (First Run)...");
+    char * GGML_V2_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
+    char * GGML_V2_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
+    int plat_num = (GGML_V2_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_V2_CLBLAST_PLATFORM));
+    int dev_num = (GGML_V2_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_V2_CLBLAST_DEVICE));
+    printf("\nInitializing LEGACY v2 CLBlast (First Run)...");
     printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
     cl_uint num_platforms;
     clGetPlatformIDs(0, NULL, &num_platforms);
@@ -376,7 +376,7 @@ void ggml_cl_init(void) {
     CL_CHECK(err, "clCreateKernel");
 }
 
-static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
+static void ggml_v2_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
     if (req_size <= *cur_size) {
         return;
     }
@@ -391,38 +391,38 @@ static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags
     CL_CHECK(err, "clCreateBuffer");
 }
 
-static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
+static cl_kernel* ggml_v2_get_to_fp32_cl(ggml_v2_type type) {
     switch (type) {
-        case GGML_TYPE_Q4_0:
+        case GGML_V2_TYPE_Q4_0:
             return &dequantize_row_q4_0_cl;
-        case GGML_TYPE_Q4_1:
+        case GGML_V2_TYPE_Q4_1:
             return &dequantize_row_q4_1_cl;
-        case GGML_TYPE_Q5_0:
+        case GGML_V2_TYPE_Q5_0:
             return &dequantize_row_q5_0_cl;
-        case GGML_TYPE_Q5_1:
+        case GGML_V2_TYPE_Q5_1:
            return &dequantize_row_q5_1_cl;
-        case GGML_TYPE_Q8_0:
+        case GGML_V2_TYPE_Q8_0:
            return &dequantize_row_q8_0_cl;
-        case GGML_TYPE_F16:
+        case GGML_V2_TYPE_F16:
            return &convert_row_f16_cl;
        default:
            return nullptr;
    }
 }
 
-static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) {
+static cl_kernel* ggml_v2_get_dequantize_mul_mat_vec_cl(ggml_v2_type type) {
     switch (type) {
-        case GGML_TYPE_Q4_0:
+        case GGML_V2_TYPE_Q4_0:
             return &dequantize_mul_mat_vec_q4_0_cl;
-        case GGML_TYPE_Q4_1:
+        case GGML_V2_TYPE_Q4_1:
             return &dequantize_mul_mat_vec_q4_1_cl;
-        case GGML_TYPE_Q5_0:
+        case GGML_V2_TYPE_Q5_0:
             return &dequantize_mul_mat_vec_q5_0_cl;
-        case GGML_TYPE_Q5_1:
+        case GGML_V2_TYPE_Q5_1:
            return &dequantize_mul_mat_vec_q5_1_cl;
-        case GGML_TYPE_Q8_0:
+        case GGML_V2_TYPE_Q8_0:
            return &dequantize_mul_mat_vec_q8_0_cl;
-        case GGML_TYPE_F16:
+        case GGML_V2_TYPE_F16:
            return &convert_mul_mat_vec_f16_cl;
        default:
            return nullptr;
@@ -454,7 +454,7 @@ struct cl_buffer {
 static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS];
 static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT;
 
-static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
+static cl_mem ggml_v2_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flags flags) {
     scoped_spin_lock lock(g_cl_pool_lock);
     cl_int err;
 
@@ -473,7 +473,7 @@ static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size, cl_mem_flag
     return mem;
 }
 
-static void ggml_cl_pool_free(cl_mem mem, size_t size) {
+static void ggml_v2_cl_pool_free(cl_mem mem, size_t size) {
     scoped_spin_lock lock(g_cl_pool_lock);
 
     for (int i = 0; i < MAX_CL_BUFFERS; ++i) {
@@ -488,7 +488,7 @@ static void ggml_cl_pool_free(cl_mem mem, size_t size) {
     clReleaseMemObject(mem);
 }
 
-static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
+static cl_int ggml_v2_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_v2_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) {
     cl_int err;
     const uint64_t ne0 = src->ne[0];
     const uint64_t ne1 = src->ne[1];
@@ -496,9 +496,9 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o
     const uint64_t nb1 = src->nb[1];
     const uint64_t nb2 = src->nb[2];
     const uint64_t nb3 = src->nb[3];
-    const enum ggml_type type = src->type;
-    const 
size_t ts = ggml_type_size(type); - const size_t bs = ggml_blck_size(type); + const enum ggml_v2_type type = src->type; + const size_t ts = ggml_v2_type_size(type); + const size_t bs = ggml_v2_blck_size(type); const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3); if (nb0 == ts && nb1 == ts*ne0/bs) { @@ -525,7 +525,7 @@ static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t o return err; } -static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_v2_cl_mul_mat_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) { const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -544,18 +544,18 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr const int d_ne = ne11 * ne01; size_t x_size, y_size, d_size; - cl_mem d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY); - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); + cl_mem d_X = ggml_v2_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_ONLY); + cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); + cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); cl_int err; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { // copy data to device - err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); - err |= ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL); - CL_CHECK(err, "ggml_cl_h2d_tensor_2d"); + err = ggml_v2_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); + err |= ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL); + CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d"); CL_CHECK(clFinish(queue), "clFinish"); @@ -574,7 +574,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr if (status != clblast::StatusCode::kSuccess) { printf("\nF32 Matmul Failed (%d): You may be out of VRAM. 
Please check if you have enough.\n",status); - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } // copy dst to host @@ -584,13 +584,13 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } } - ggml_cl_pool_free(d_X, x_size); - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); + ggml_v2_cl_pool_free(d_X, x_size); + ggml_v2_cl_pool_free(d_Y, y_size); + ggml_v2_cl_pool_free(d_D, d_size); } -static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) { - GGML_ASSERT(fp16_support); +static void ggml_v2_cl_mul_mat_f16(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst, void * wdata, size_t /* wsize */) { + GGML_V2_ASSERT(fp16_support); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; @@ -608,16 +608,16 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); - const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); + const ggml_v2_fp16_t alpha = ggml_v2_fp32_to_fp16(1.0f); + const ggml_v2_fp16_t beta = ggml_v2_fp32_to_fp16(0.0f); const int x_ne = ne01 * ne00; const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; size_t x_size, y_size, d_size; - cl_mem d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY); - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY); + cl_mem d_X = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * x_ne, &x_size, CL_MEM_READ_ONLY); + cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * y_ne, &y_size, CL_MEM_READ_ONLY); + cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(ggml_v2_fp16_t) * d_ne, &d_size, CL_MEM_WRITE_ONLY); cl_int err; @@ -627,20 +627,20 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { // copy src0 to device - err = ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); - CL_CHECK(err, "ggml_cl_h2d_tensor_2d"); + err = ggml_v2_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL); + CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d"); // convert src1 to fp16 // TODO: use multiple threads - ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); + ggml_v2_fp16_t * const tmp = (ggml_v2_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02); char * src1i = (char *) src1->data + i03*nb13 + i02*nb12; if (src1_cont_rows) { if (src1_cont_cols) { - ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); + ggml_v2_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); } else { for (int64_t i01 = 0; i01 < ne11; i01++) { - ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); + ggml_v2_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10); } } } @@ -648,14 +648,14 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i01 = 0; i01 < ne11; i01++) { for (int64_t i00 = 0; i00 < ne10; i00++) { // very slow due to no inlining - tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); + tmp[i01*ne10 + i00] = ggml_v2_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10)); } } } // copy src1 to device - err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 
0, NULL, NULL); - CL_CHECK(err, "ggml_cl_h2d_tensor_2d"); + err |= clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_v2_fp16_t) * y_ne, tmp, 0, NULL, NULL); + CL_CHECK(err, "ggml_v2_cl_h2d_tensor_2d"); CL_CHECK(clFinish(queue), "clFinish"); @@ -673,24 +673,24 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr if (status != clblast::StatusCode::kSuccess) { printf("\nF16 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status); - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } // copy dst to host, then convert to float - err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL); + err = clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_v2_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL); float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); - ggml_fp16_to_fp32_row(tmp, d, d_ne); + ggml_v2_fp16_to_fp32_row(tmp, d, d_ne); } } - ggml_cl_pool_free(d_X, x_size); - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); + ggml_v2_cl_pool_free(d_X, x_size); + ggml_v2_cl_pool_free(d_Y, y_size); + ggml_v2_cl_pool_free(d_D, d_size); } -static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_v2_cl_mul_mat_q_f32(const ggml_v2_tensor * src0, const ggml_v2_tensor * src1, ggml_v2_tensor * dst) { const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; @@ -701,7 +701,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - const ggml_type type = src0->type; + const ggml_v2_type type = src0->type; const bool mul_mat_vec = ne11 == 1; const float alpha = 1.0f; @@ -709,39 +709,39 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * const int x_ne = ne01 * ne00; const int y_ne = ne11 * ne10; const int d_ne = ne11 * ne01; - const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type); + const size_t q_sz = ggml_v2_type_size(type) * x_ne / ggml_v2_blck_size(type); size_t x_size, y_size, d_size, q_size; cl_mem d_X; if (!mul_mat_vec) { - d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE); + d_X = ggml_v2_cl_pool_malloc(sizeof(float) * x_ne, &x_size, CL_MEM_READ_WRITE); } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); + cl_mem d_Y = ggml_v2_cl_pool_malloc(sizeof(float) * y_ne, &y_size, CL_MEM_READ_ONLY); + cl_mem d_D = ggml_v2_cl_pool_malloc(sizeof(float) * d_ne, &d_size, CL_MEM_WRITE_ONLY); cl_mem d_Q; - if (src0->backend == GGML_BACKEND_CPU) { - d_Q = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); + if (src0->backend == GGML_V2_BACKEND_CPU) { + d_Q = ggml_v2_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); } - cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); - cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); - GGML_ASSERT(to_fp32_cl != nullptr); + cl_kernel* to_fp32_cl = ggml_v2_get_to_fp32_cl(type); + cl_kernel* dmmv = ggml_v2_get_dequantize_mul_mat_vec_cl(type); + GGML_V2_ASSERT(to_fp32_cl != nullptr); for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { cl_event ev_sgemm; // copy src0 to device if necessary - if (src0->backend == GGML_BACKEND_CPU) { - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); - } else if 
(src0->backend == GGML_BACKEND_CL) { + if (src0->backend == GGML_V2_BACKEND_CPU) { + CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d"); + } else if (src0->backend == GGML_V2_BACKEND_CL) { d_Q = *(cl_mem*) src0->data; } else { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); + CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d"); // compute const size_t global = ne01 * CL_DMMV_BLOCK_SIZE; @@ -763,7 +763,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, NULL, &global, NULL, 0, NULL, NULL), "clEnqueueNDRangeKernel"); // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_cl_h2d_tensor_2d"); + CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i03, i02, NULL), "ggml_v2_cl_h2d_tensor_2d"); // wait for conversion CL_CHECK(clFinish(queue), "clFinish"); @@ -781,7 +781,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * if (status != clblast::StatusCode::kSuccess) { printf("\nQF32 Matmul Failed (%d): You may be out of VRAM. Please check if you have enough.\n",status); - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } } @@ -793,146 +793,146 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } if (!mul_mat_vec) { - ggml_cl_pool_free(d_X, x_size); + ggml_v2_cl_pool_free(d_X, x_size); } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); - if (src0->backend == GGML_BACKEND_CPU) { - ggml_cl_pool_free(d_Q, q_size); + ggml_v2_cl_pool_free(d_Y, y_size); + ggml_v2_cl_pool_free(d_D, d_size); + if (src0->backend == GGML_V2_BACKEND_CPU) { + ggml_v2_cl_pool_free(d_Q, q_size); } } -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +bool ggml_v2_cl_can_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst) { const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; // TODO: find the optimal values for these - if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && - src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - ((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) { + if ((src0->type == GGML_V2_TYPE_F32 || src0->type == GGML_V2_TYPE_F16 || ggml_v2_is_quantized(src0->type)) && + src1->type == GGML_V2_TYPE_F32 && + dst->type == GGML_V2_TYPE_F32 && + ((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_V2_BACKEND_CL)) { return true; } return false; } -bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { +bool ggml_v2_cl_mul_mat_use_f16(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * /* dst */) { // If device doesn't support FP16 if (!fp16_support) { return false; } - size_t src0_sz = ggml_nbytes(src0); - size_t src1_sz = ggml_nbytes(src1); + size_t src0_sz = ggml_v2_nbytes(src0); + size_t src1_sz = ggml_v2_nbytes(src1); // mul_mat_q: src0 is converted to fp32 on device size_t mul_mat_q_transfer = 
src0_sz + src1_sz; // mul_mat_f16: src1 is converted to fp16 on cpu - size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1); + size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_v2_fp16_t) * ggml_v2_nelements(src1); // choose the smaller one to transfer to the device // TODO: this is not always the best choice due to the overhead of converting to fp16 return mul_mat_f16_transfer < mul_mat_q_transfer; } -void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { - GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); +void ggml_v2_cl_mul_mat(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst, void * wdata, size_t wsize) { + GGML_V2_ASSERT(ggml_v2_cl_can_mul_mat(src0, src1, dst)); - if (src0->type == GGML_TYPE_F32) { - ggml_cl_mul_mat_f32(src0, src1, dst); + if (src0->type == GGML_V2_TYPE_F32) { + ggml_v2_cl_mul_mat_f32(src0, src1, dst); } - else if (src0->type == GGML_TYPE_F16) { - if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { - ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); + else if (src0->type == GGML_V2_TYPE_F16) { + if (ggml_v2_cl_mul_mat_use_f16(src0, src1, dst)) { + ggml_v2_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); } else { - ggml_cl_mul_mat_q_f32(src0, src1, dst); + ggml_v2_cl_mul_mat_q_f32(src0, src1, dst); } } - else if (ggml_is_quantized(src0->type)) { - ggml_cl_mul_mat_q_f32(src0, src1, dst); + else if (ggml_v2_is_quantized(src0->type)) { + ggml_v2_cl_mul_mat_q_f32(src0, src1, dst); } else { - GGML_ASSERT(false); + GGML_V2_ASSERT(false); } } -size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { - return ggml_nelements(src1) * sizeof(ggml_fp16_t); +size_t ggml_v2_cl_mul_mat_get_wsize(const struct ggml_v2_tensor * src0, const struct ggml_v2_tensor * src1, struct ggml_v2_tensor * dst) { + if (ggml_v2_cl_mul_mat_use_f16(src0, src1, dst)) { + return ggml_v2_nelements(src1) * sizeof(ggml_v2_fp16_t); } return 0; } -void ggml_cl_transform_tensor(ggml_tensor * tensor) { +void ggml_v2_cl_transform_tensor(ggml_v2_tensor * tensor) { const int64_t ne0 = tensor->ne[0]; const int64_t ne1 = tensor->ne[1]; const int64_t ne2 = tensor->ne[2]; const int64_t ne3 = tensor->ne[3]; - const ggml_type type = tensor->type; - const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type); + const ggml_v2_type type = tensor->type; + const size_t q_sz = ggml_v2_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_v2_blck_size(type); size_t q_size; cl_mem* dst = (cl_mem*) malloc(sizeof(cl_mem)); - *dst = ggml_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); + *dst = ggml_v2_cl_pool_malloc(q_sz, &q_size, CL_MEM_READ_ONLY); // copy tensor to device for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i2 = 0; i2 < ne2; i2++) { int i = i3*ne2 + i2; - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL), "ggml_cl_h2d_tensor_2d"); + CL_CHECK(ggml_v2_cl_h2d_tensor_2d(queue, *dst, i*ne0*ne1, tensor, i3, i2, NULL), "ggml_v2_cl_h2d_tensor_2d"); } } CL_CHECK(clFinish(queue), "clFinish"); tensor->data = dst; - tensor->backend = GGML_BACKEND_CL; + tensor->backend = GGML_V2_BACKEND_CL; } -void ggml_cl_sgemm_wrapper( - const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, +void ggml_v2_cl_sgemm_wrapper( + const enum ggml_v2_blas_order order, const 
enum ggml_v2_blas_op trans_a, const enum ggml_v2_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype) { cl_int err = 0; - cl_kernel * kernel = ggml_get_to_fp32_cl((ggml_type)btype); + cl_kernel * kernel = ggml_v2_get_to_fp32_cl((ggml_v2_type)btype); size_t global = n * k, local, size_qb; bool dequant; switch (btype) { - case GGML_TYPE_F32: + case GGML_V2_TYPE_F32: dequant = false; break; - case GGML_TYPE_Q4_0: + case GGML_V2_TYPE_Q4_0: dequant = true; local = 16; size_qb = global * (sizeof(float) + local) / 32; break; - case GGML_TYPE_Q4_1: + case GGML_V2_TYPE_Q4_1: dequant = true; local = 16; size_qb = global * (sizeof(float) * 2 + local) / 32; break; - case GGML_TYPE_Q5_0: + case GGML_V2_TYPE_Q5_0: dequant = true; local = 16; - size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32; + size_qb = global * (sizeof(ggml_v2_fp16_t) + sizeof(uint32_t) + local) / 32; break; - case GGML_TYPE_Q5_1: + case GGML_V2_TYPE_Q5_1: dequant = true; local = 16; - size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; + size_qb = global * (sizeof(ggml_v2_fp16_t) * 2 + sizeof(uint32_t) + local) / 32; break; - case GGML_TYPE_Q8_0: + case GGML_V2_TYPE_Q8_0: dequant = true; local = 32; size_qb = global * (sizeof(float) + local) / 32; @@ -947,12 +947,12 @@ void ggml_cl_sgemm_wrapper( const size_t size_c = m * n * sizeof(float); // Prepare buffers - ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a); + ggml_v2_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a); if (dequant) { - ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb); + ggml_v2_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb); } - ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); - ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); + ggml_v2_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b); + ggml_v2_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c); cl_event ev_a, ev_qb, ev_b; diff --git a/otherarch/ggml_v2.c b/otherarch/ggml_v2.c index 26c05725d..7599ddbc9 100644 --- a/otherarch/ggml_v2.c +++ b/otherarch/ggml_v2.c @@ -1463,7 +1463,7 @@ static void ggml_v2_vec_dot_q5_0_q8_0(const int n, float * restrict s, const voi static void ggml_v2_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); static void ggml_v2_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy); -static const quantize_fns_t quantize_fns[GGML_V2_TYPE_COUNT] = { +static const quantize_fns_t2 quantize_fns[GGML_V2_TYPE_COUNT] = { [GGML_V2_TYPE_Q4_0] = { .dequantize_row_q = (dequantize_row_q_t) dequantize_row_q4_0, .quantize_row_q = quantize_row_q4_0, @@ -1515,14 +1515,14 @@ static const quantize_fns_t quantize_fns[GGML_V2_TYPE_COUNT] = { }; // For internal test use -quantize_fns_t ggml_v2_internal_get_quantize_fn(size_t i) { +quantize_fns_t2 ggml_v2_internal_get_quantize_fn(size_t i) { GGML_V2_ASSERT(i < GGML_V2_TYPE_COUNT); return quantize_fns[i]; } bool quants_unshuffled = false; //new GGJT_2 is unshuffled, all old ones are shuffled -static const quantize_fns_t quantize_fns_v2[GGML_V2_TYPE_COUNT]; //forward decl -static inline quantize_fns_t get_quantize_fn(size_t i) +static const quantize_fns_t2 quantize_fns_v2[GGML_V2_TYPE_COUNT]; //forward decl +static inline 
quantize_fns_t2 get_quantize_fn(size_t i) { return(quants_unshuffled?quantize_fns[i]:quantize_fns_v2[i]); } @@ -17186,7 +17186,7 @@ inline bool GetQuantsUnshuffled() } //TODO: integrate backwards compat -static const quantize_fns_t quantize_fns_v2[GGML_V2_TYPE_COUNT] = { +static const quantize_fns_t2 quantize_fns_v2[GGML_V2_TYPE_COUNT] = { [GGML_V2_TYPE_Q4_0] = { .dequantize_row_q = dequantize_row_q4_0_v2, .quantize_row_q = quantize_row_q4_0_v2, diff --git a/otherarch/ggml_v2.h b/otherarch/ggml_v2.h index f24d6748d..dd95ab2ca 100644 --- a/otherarch/ggml_v2.h +++ b/otherarch/ggml_v2.h @@ -1134,9 +1134,9 @@ extern "C" { quantize_row_q_t quantize_row_q_dot; vec_dot_q_t vec_dot_q; enum ggml_v2_type vec_dot_type; - } quantize_fns_t; + } quantize_fns_t2; - quantize_fns_t ggml_v2_internal_get_quantize_fn(size_t i); + quantize_fns_t2 ggml_v2_internal_get_quantize_fn(size_t i); #ifdef __cplusplus } diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp index dd356b39d..030b9d450 100644 --- a/otherarch/gpt2_v2.cpp +++ b/otherarch/gpt2_v2.cpp @@ -15,12 +15,9 @@ #include "model_adapter.h" -#if defined(GGML_USE_CLBLAST) -#include "ggml_v2-opencl.h" -#endif // load the model's weights from a file -ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) { +ModelLoadResult gpt2_v2_model_load(const std::string & fname, gpt2_v2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) { printf("%s: loading model from '%s'\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); @@ -379,8 +376,8 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g // - embd_inp: the embeddings of the tokens in the context // - embd_w: the predicted logits for the next token // -bool gpt2_eval( - const gpt2_model & model, +bool gpt2_v2_eval( + const gpt2_v2_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, diff --git a/otherarch/gpt2_v3.cpp b/otherarch/gpt2_v3.cpp new file mode 100644 index 000000000..7e957e9e2 --- /dev/null +++ b/otherarch/gpt2_v3.cpp @@ -0,0 +1,696 @@ +#include "ggml.h" +#include "otherarch.h" + +#include "utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "model_adapter.h" + +#if defined(GGML_USE_CLBLAST) +#include "ggml-opencl.h" +#endif + +// load the model's weights from a file +ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return ModelLoadResult::FAIL; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return ModelLoadResult::FAIL; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = 
hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return ModelLoadResult::FAIL; + } + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + word.resize(len); + fin.read((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + auto memory_type = GGML_TYPE_F16; + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return ModelLoadResult::FAIL; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte + ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe + ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // memory_k + ctx_size += 1.5*(n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // memory_v + + ctx_size += (6 + 12*n_layer)*512; // object overhead + + printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor)); + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params; + params.mem_size = ctx_size; + params.mem_buffer = NULL; 
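// Reviewer note (not part of the patch): ggml_init() carves every tensor in this
// loader out of the single arena sized by ctx_size above, so the estimate has to
// cover all weights plus per-object bookkeeping (the "(6 + 12*n_layer)*512" term).
// The 1.5x factor on the memory_k/memory_v terms mirrors the later
// ggml_new_tensor_1d(..., n_elements*1.5) calls, which appear to reserve extra
// KV-cache headroom beyond the model's trained n_ctx.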
+ params.no_alloc = false; + + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return ModelLoadResult::FAIL; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements*1.5); + model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements*1.5); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, 
memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return ModelLoadResult::FAIL; + } + + auto tensor = model.tensors[name.data()]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + return ModelLoadResult::FAIL; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n", + __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); + return ModelLoadResult::FAIL; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return ModelLoadResult::FAIL; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + // GPT-2 models share the WTE tensor as the LM head + if (name == "model/wte" && has_lm_head == false) { + memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor)); + } + + if (name == "model/lm_head") { + has_lm_head = true; + } + + total_size += ggml_nbytes(tensor); + } + + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + +// //gpu offload for gpt2 +// #if defined(GGML_USE_CLBLAST) +// if(gpulayers>0) +// { +// const auto & hparams = model.hparams; +// const int n_gpu = std::min(gpulayers, int(hparams.n_layer)); +// if(GetQuantsUnshuffled()) +// { +// SetGPULayers(n_gpu); + +// fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + +// size_t vram_total = 0; + +// for (int i = 0; i < n_gpu; ++i) { +// const auto & layer = model.layers[i]; + +// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g); +// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b); +// ggml_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_nbytes(layer.ln_2_g); +// ggml_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_nbytes(layer.ln_2_b); +// ggml_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w); +// ggml_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_nbytes(layer.c_attn_attn_b); +// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); +// ggml_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_nbytes(layer.c_attn_proj_b); +// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total 
+= ggml_nbytes(layer.c_mlp_fc_w); +// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b); +// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); +// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b); +// } + +// fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); +// } +// else +// { +// if(n_gpu>0) +// { +// printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n"); +// } +// } +// } +// #endif + + + return ModelLoadResult::SUCCESS; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token, + FileFormat file_format) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + + static size_t buf_size = 256u*1024*1024; + static void * buf = malloc(buf_size); + + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + if (buf_size_new > buf_size) + { + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) + { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + } + + struct ggml_init_params params; + params.mem_size = buf_size; + params.mem_buffer = buf; + params.no_alloc = false; + + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = 
ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), + cur); + } + + // add the input + cur = 
ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_2_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_2_b, cur)); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute (ctx0, &gf); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +} \ No newline at end of file diff --git a/otherarch/gptj_v1.cpp b/otherarch/gptj_v1.cpp index 67bc15062..f112e8449 100644 --- a/otherarch/gptj_v1.cpp +++ b/otherarch/gptj_v1.cpp @@ -16,7 +16,7 @@ // load the model's weights from a file -ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1 & model, gpt_vocab & vocab, FileFormat file_format) { +ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_v1_model & model, gpt_vocab & vocab, FileFormat file_format) { printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); bool super_old_format = (file_format==FileFormat::GPTJ_1); @@ -363,7 +363,7 @@ ModelLoadResult legacy_gptj_model_load(const std::string & fname, gptj_model_v1 // The GPT-J model requires about 16MB of memory per input token. 
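// Reviewer note (not part of the patch): with this change the GPT-J and GPT-2 code
// paths carry multiple loader generations -- *_v1 against the original ggml fork,
// *_v2 against the frozen otherarch/ggml_v2 copy, and the new *_v3 files built on
// current ggml. The caller is presumably expected to pick one from the detected
// FileFormat, roughly like the sketch below; the dispatch name and the exact
// format-to-loader mapping are assumptions, not code from this patch:
static ModelLoadResult load_gptj_for_format(const std::string & fname,
                                            gptj_v1_model & m1,
                                            gptj_v2_model & m2,
                                            gptj_model    & m3,
                                            gpt_vocab     & vocab,
                                            FileFormat ff, int gpulayers)
{
    if (ff == FileFormat::GPTJ_1 || ff == FileFormat::GPTJ_2) {
        return legacy_gptj_model_load(fname, m1, vocab, ff);     // oldest ggml fork
    }
    if (ff == FileFormat::GPTJ_3 || ff == FileFormat::GPTJ_4) {
        return gptj_v2_model_load(fname, m2, vocab, gpulayers);  // frozen ggml_v2 copy
    }
    return gptj_model_load(fname, m3, vocab, gpulayers);         // GPTJ_5, current ggml
}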
// bool legacy_gptj_eval( - const gptj_model_v1 & model, + const gptj_v1_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp index 0b678df65..e84142798 100644 --- a/otherarch/gptj_v2.cpp +++ b/otherarch/gptj_v2.cpp @@ -18,7 +18,7 @@ // load the model's weights from a file -ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) { +ModelLoadResult gptj_v2_model_load(const std::string & fname, gptj_v2_model & model, gpt_vocab & vocab, int gpulayers) { printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); @@ -382,8 +382,8 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g // // The GPT-J model requires about 16MB of memory per input token. // -bool gptj_eval( - const gptj_model & model, +bool gptj_v2_eval( + const gptj_v2_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, diff --git a/otherarch/gptj_v3.cpp b/otherarch/gptj_v3.cpp new file mode 100644 index 000000000..4b207e0bd --- /dev/null +++ b/otherarch/gptj_v3.cpp @@ -0,0 +1,613 @@ +#include "ggml.h" +#include "otherarch.h" + +#include "utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "model_adapter.h" + + + +// load the model's weights from a file +ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) { + printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return ModelLoadResult::FAIL; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return ModelLoadResult::FAIL; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: n_rot = %d\n", __func__, hparams.n_rot); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return ModelLoadResult::FAIL; + } + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + 
fin.read((char *) &len, sizeof(len)); + + word.resize(len); + fin.read((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return ModelLoadResult::FAIL; + } + + auto & ctx = model.ctx; + + auto memory_type = GGML_TYPE_F16; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g + ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v + + ctx_size += (5 + 10*n_layer)*512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params; + params.mem_size = ctx_size; + params.mem_buffer = NULL; + params.no_alloc = false; + + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return ModelLoadResult::FAIL; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + + // map by name + model.tensors["transformer.wte.weight"] = model.wte; + + model.tensors["transformer.ln_f.weight"] = model.ln_f_g; + model.tensors["transformer.ln_f.bias"] = model.ln_f_b; + + model.tensors["lm_head.weight"] = model.lmh_g; + model.tensors["lm_head.bias"] = model.lmh_b; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + 
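// Reviewer note (not part of the patch): unlike the GPT-2 loader earlier in this
// patch, which allocates a single fused c_attn_attn_w of shape [n_embd, 3*n_embd]
// plus a bias, each GPT-J layer keeps three separate, bias-free projection
// matrices (c_attn_q_proj_w / c_attn_k_proj_w / c_attn_v_proj_w), each
// [n_embd, n_embd] -- matching the three wtype terms counted per layer in the
// ctx_size estimate above.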
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g; + model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b; + + model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"] = layer.c_attn_q_proj_w; + model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"] = layer.c_attn_k_proj_w; + model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"] = layer.c_attn_v_proj_w; + + model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w; + + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w; + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b; + + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w; + model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + int n_tensors = 0; + size_t total_size = 0; + + printf("%s: ", __func__); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return ModelLoadResult::FAIL; + } + + auto tensor = model.tensors[name.data()]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + return ModelLoadResult::FAIL; + } + + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + + //test for transposition and retry older loader + if(tensor->ne[0]==ne[1] && tensor->ne[1]==ne[0] && should_transpose_layer(name)) + { + printf("\nFound a 
transposed tensor. This could be an older or newer model. Retrying load..."); + ggml_free(ctx); + return ModelLoadResult::RETRY_LOAD; + } + else + { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); + return ModelLoadResult::FAIL; + } + + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + return ModelLoadResult::FAIL; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_nbytes(tensor); + if (++n_tensors % 8 == 0) { + printf("."); + fflush(stdout); + } + } + + printf(" done\n"); + + printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + } + + fin.close(); + +// //gpu offload for gptj +// #if defined(GGML_USE_CLBLAST) +// if(gpulayers>0) +// { +// const auto & hparams = model.hparams; +// const int n_gpu = std::min(gpulayers, int(hparams.n_layer)); +// if(GetQuantsUnshuffled()) +// { +// SetGPULayers(n_gpu); + +// fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu); + +// size_t vram_total = 0; + +// for (int i = 0; i < n_gpu; ++i) { +// const auto & layer = model.layers[i]; + +// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g); +// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b); +// ggml_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w); +// ggml_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w); +// ggml_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w); +// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w); +// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w); +// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b); +// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w); +// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b); +// } + +// fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024); +// } +// else +// { +// if(n_gpu>0) +// { +// printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n"); +// } +// } +// } +// #endif + + + return ModelLoadResult::SUCCESS; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +// The GPT-J model requires about 16MB of memory per input token. 
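
The gptj_eval implementation that follows keeps a static scratch buffer and grows it from the per-token memory estimate measured on earlier calls. A minimal standalone sketch of that sizing rule, with the constants copied from the function below (the helper name is mine, not part of this patch):

```cpp
#include <cstddef>

// Illustrative only: mirrors the scratch-buffer sizing used by gptj_eval below.
// `mem_per_token` is the ggml memory measured per token on a previous call
// (0 on the very first call); `n_tokens` is the current batch size N.
static size_t required_scratch_bytes(size_t mem_per_token, size_t n_tokens,
                                     size_t current_size = 256u * 1024 * 1024) {
    if (mem_per_token == 0) {
        return current_size;  // first call: keep the 256 MB default
    }
    // grow only when the estimate (plus 64 MB of headroom) no longer fits
    if (mem_per_token * n_tokens * 2 + 64u * 1024 * 1024 > current_size) {
        const size_t grown = 320u * 1024 * 1024 + 2 * mem_per_token * n_tokens;
        return grown > current_size ? grown : current_size;
    }
    return current_size;
}
```
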
+// +bool gptj_eval( + const gptj_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + const int n_rot = hparams.n_rot; + + static size_t buf_size = 256u*1024*1024; + static void * buf = malloc(buf_size); + + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 320u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + if (buf_size_new > buf_size) + { + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) + { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + } + + struct ggml_init_params params; + params.mem_size = buf_size; + params.mem_buffer = buf; + params.no_alloc = false; + + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + // wte + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + cur = ggml_norm(ctx0, inpL); + + // cur = ln_1_g*cur + ln_1_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + struct ggml_tensor * inpSA = cur; + + // self-attention + { + struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0); + + // store key and value to memory + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur)); + + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, 
+ ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection (no bias) + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + } + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + // this is independent of the self-attention result, so it could be done in parallel to the self-attention + { + // note here we pass inpSA instead of cur + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + inpSA); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur), + cur); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + // cur = proj_w*cur + proj_b + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur), + cur); + } + + // self-attention + FF + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpL); + } + + // norm + { + inpL = ggml_norm(ctx0, inpL); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + // lm_head + { + inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + + inpL = ggml_add(ctx0, + ggml_repeat(ctx0, model.lmh_b, inpL), + inpL); + } + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute (ctx0, &gf); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-j.dot"); + //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +} \ No newline at end of file diff --git a/otherarch/llama_v2-util.h b/otherarch/llama_v2-util.h new file mode 100644 index 000000000..cf1b5860c --- /dev/null +++ b/otherarch/llama_v2-util.h @@ -0,0 +1,464 @@ +// Internal header to be included only by llama.cpp. +// Contains wrappers around OS interfaces. 
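
The new otherarch/llama_v2-util.h that begins here carries the OS-level wrappers the v2 llama code path relies on: buffered binary file I/O, memory mapping, and page locking. Purely as a usage illustration (the model path is hypothetical, and the include path assumes the layout introduced by this patch), reading a file's magic through the llama_v2_file wrapper defined below might look like:

```cpp
#include "otherarch/llama_v2-util.h"  // assumption: include path as laid out in this patch
#include <cstdint>
#include <cstdio>
#include <stdexcept>

int main() {
    try {
        llama_v2_file f("model.bin", "rb");   // hypothetical model path
        const uint32_t magic = f.read_u32();  // e.g. 0x67676d6c ("ggml")
        printf("magic = 0x%08x, file size = %zu bytes\n", (unsigned) magic, f.size);
        // length-prefixed vocab entries can be read the same way:
        //   uint32_t len  = f.read_u32();
        //   std::string w = f.read_string(len);
    } catch (const std::runtime_error & e) {
        fprintf(stderr, "load failed: %s\n", e.what());
        return 1;
    }
    return 0;
}
```
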
+#pragma once +#ifndef LLAMA_V2_UTIL_H +#define LLAMA_V2_UTIL_H + +#include "llama-util.h" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef __has_include + #if __has_include() + #include + #if defined(_POSIX_MAPPED_FILES) + #include + #endif + #if defined(_POSIX_MEMLOCK_RANGE) + #include + #endif + #endif +#endif + +#if defined(_WIN32) + #define WIN32_LEAN_AND_MEAN + #ifndef NOMINMAX + #define NOMINMAX + #endif + #include + #include + #include // for _fseeki64 +#endif + +#define LLAMA_V2_ASSERT(x) \ + do { \ + if (!(x)) { \ + fprintf(stderr, "LLAMA_V2_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \ + abort(); \ + } \ + } while (0) + +#ifdef __GNUC__ +#ifdef __MINGW32__ +__attribute__((format(gnu_printf, 1, 2))) +#else +__attribute__((format(printf, 1, 2))) +#endif +#endif + + +struct llama_v2_file { + // use FILE * so we don't have to re-open the file to mmap + FILE * fp; + size_t size; + + llama_v2_file(const char * fname, const char * mode) { + fp = std::fopen(fname, mode); + if (fp == NULL) { + throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); + } + seek(0, SEEK_END); + size = tell(); + seek(0, SEEK_SET); + } + + size_t tell() const { +#ifdef _WIN32 + __int64 ret = _ftelli64(fp); +#else + long ret = std::ftell(fp); +#endif + LLAMA_V2_ASSERT(ret != -1); // this really shouldn't fail + return (size_t) ret; + } + + void seek(size_t offset, int whence) { +#ifdef _WIN32 + int ret = _fseeki64(fp, (__int64) offset, whence); +#else + int ret = std::fseek(fp, (long) offset, whence); +#endif + LLAMA_V2_ASSERT(ret == 0); // same + } + + void read_raw(void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + std::size_t ret = std::fread(ptr, size, 1, fp); + if (ferror(fp)) { + throw std::runtime_error(format("read error: %s", strerror(errno))); + } + if (ret != 1) { + throw std::runtime_error(std::string("unexpectedly reached end of file")); + } + } + + std::uint32_t read_u32() { + std::uint32_t ret; + read_raw(&ret, sizeof(ret)); + return ret; + } + + std::string read_string(std::uint32_t len) { + std::vector chars(len); + read_raw(chars.data(), len); + return std::string(chars.data(), len); + } + + void write_raw(const void * ptr, size_t size) { + if (size == 0) { + return; + } + errno = 0; + size_t ret = std::fwrite(ptr, size, 1, fp); + if (ret != 1) { + throw std::runtime_error(format("write error: %s", strerror(errno))); + } + } + + void write_u32(std::uint32_t val) { + write_raw(&val, sizeof(val)); + } + + ~llama_v2_file() { + if (fp) { + std::fclose(fp); + } + } +}; + +#if defined(_WIN32) +static std::string llama_v2_format_win_err(DWORD err) { + LPSTR buf; + size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL); + if (!size) { + return "FormatMessageA failed"; + } + std::string ret(buf, size); + LocalFree(buf); + return ret; +} +#endif + +struct llama_v2_mmap { + void * addr; + size_t size; + + llama_v2_mmap(const llama_v2_mmap &) = delete; + +#ifdef _POSIX_MAPPED_FILES + static constexpr bool SUPPORTED = true; + + llama_v2_mmap(struct llama_v2_file * file, bool prefetch = true) { + size = file->size; + int fd = fileno(file->fp); + int flags = MAP_SHARED; +#ifdef __linux__ + flags |= MAP_POPULATE; +#endif + addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + throw std::runtime_error(format("mmap 
failed: %s", strerror(errno))); + } + + if (prefetch) { + // Advise the kernel to preload the mapped memory + if (madvise(addr, file->size, MADV_WILLNEED)) { + fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", + strerror(errno)); + } + } + } + + ~llama_v2_mmap() { + munmap(addr, size); + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + llama_v2_mmap(struct llama_v2_file * file, bool prefetch = true) { + size = file->size; + + HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); + + HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); + DWORD error = GetLastError(); + + if (hMapping == NULL) { + throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_v2_format_win_err(error).c_str())); + } + + addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0); + error = GetLastError(); + CloseHandle(hMapping); + + if (addr == NULL) { + throw std::runtime_error(format("MapViewOfFile failed: %s", llama_v2_format_win_err(error).c_str())); + } + + #if _WIN32_WINNT >= _WIN32_WINNT_WIN8 + if (prefetch) { + // Advise the kernel to preload the mapped memory + WIN32_MEMORY_RANGE_ENTRY range; + range.VirtualAddress = addr; + range.NumberOfBytes = (SIZE_T)size; + if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) { + fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n", + llama_v2_format_win_err(GetLastError()).c_str()); + } + } + #else + #pragma message("warning: You are building for pre-Windows 8; prefetch not supported") + #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8 + } + + ~llama_v2_mmap() { + if (!UnmapViewOfFile(addr)) { + fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n", + llama_v2_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + llama_v2_mmap(struct llama_v2_file *, bool prefetch = true) { + (void)prefetch; + throw std::runtime_error(std::string("mmap not supported")); + } +#endif +}; + +// Represents some region of memory being locked using mlock or VirtualLock; +// will automatically unlock on destruction. +struct llama_v2_mlock { + void * addr = NULL; + size_t size = 0; + bool failed_already = false; + + llama_v2_mlock() {} + llama_v2_mlock(const llama_v2_mlock &) = delete; + + ~llama_v2_mlock() { + if (size) { + raw_unlock(addr, size); + } + } + + void init(void * addr) { + LLAMA_V2_ASSERT(this->addr == NULL && this->size == 0); + this->addr = addr; + } + + void grow_to(size_t target_size) { + LLAMA_V2_ASSERT(addr); + if (failed_already) { + return; + } + size_t granularity = lock_granularity(); + target_size = (target_size + granularity - 1) & ~(granularity - 1); + if (target_size > size) { + if (raw_lock((uint8_t *) addr + size, target_size - size)) { + size = target_size; + } else { + failed_already = true; + } + } + } + +#ifdef _POSIX_MEMLOCK_RANGE + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + return (size_t) sysconf(_SC_PAGESIZE); + } + + #ifdef __APPLE__ + #define MLOCK_SUGGESTION \ + "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \ + "decreasing 'vm.global_no_user_wire_amount'. 
Also try increasing RLIMIT_MLOCK (ulimit -l).\n" + #else + #define MLOCK_SUGGESTION \ + "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n" + #endif + + bool raw_lock(const void * addr, size_t size) { + if (!mlock(addr, size)) { + return true; + } else { + char* errmsg = std::strerror(errno); + bool suggest = (errno == ENOMEM); + + // Check if the resource limit is fine after all + struct rlimit lock_limit; + if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) + suggest = false; + if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) + suggest = false; + + fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s", + size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : ""); + return false; + } + } + + #undef MLOCK_SUGGESTION + + void raw_unlock(void * addr, size_t size) { + if (munlock(addr, size)) { + fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno)); + } + } +#elif defined(_WIN32) + static constexpr bool SUPPORTED = true; + + size_t lock_granularity() { + SYSTEM_INFO si; + GetSystemInfo(&si); + return (size_t) si.dwPageSize; + } + + bool raw_lock(void * addr, size_t size) { + for (int tries = 1; ; tries++) { + if (VirtualLock(addr, size)) { + return true; + } + if (tries == 2) { + fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n", + size, this->size, llama_v2_format_win_err(GetLastError()).c_str()); + return false; + } + + // It failed but this was only the first try; increase the working + // set size and try again. + SIZE_T min_ws_size, max_ws_size; + if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) { + fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n", + llama_v2_format_win_err(GetLastError()).c_str()); + return false; + } + // Per MSDN: "The maximum number of pages that a process can lock + // is equal to the number of pages in its minimum working set minus + // a small overhead." + // Hopefully a megabyte is enough overhead: + size_t increment = size + 1048576; + // The minimum must be <= the maximum, so we need to increase both: + min_ws_size += increment; + max_ws_size += increment; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { + fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", + llama_v2_format_win_err(GetLastError()).c_str()); + return false; + } + } + } + + void raw_unlock(void * addr, size_t size) { + if (!VirtualUnlock(addr, size)) { + fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n", + llama_v2_format_win_err(GetLastError()).c_str()); + } + } +#else + static constexpr bool SUPPORTED = false; + + size_t lock_granularity() { + return (size_t) 65536; + } + + bool raw_lock(const void * addr, size_t size) { + fprintf(stderr, "warning: mlock not supported on this system\n"); + return false; + } + + void raw_unlock(const void * addr, size_t size) {} +#endif +}; + +// Replacement for std::vector that doesn't require zero-initialization. 
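
The llama_v2_buffer declared next replaces std::vector for large scratch allocations because `new uint8_t[n]` leaves the bytes uninitialized, so a multi-gigabyte buffer is not written twice (once by value-initialization, once by the file read that fills it). A small usage sketch, assuming this header is included; note that resize() here discards any previous contents rather than preserving them:

```cpp
#include "otherarch/llama_v2-util.h"  // assumption: path as introduced in this patch
#include <cstdio>

int main() {
    llama_v2_buffer buf;
    buf.resize(64u * 1024 * 1024);  // one uninitialized 64 MB block, no memset
    // a loader would now fill buf.addr directly, e.g. via llama_v2_file::read_raw:
    //   file.read_raw(buf.addr, buf.size);
    printf("allocated %zu bytes at %p\n", buf.size, (void *) buf.addr);
    return 0;  // destructor delete[]s the block
}
```
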
+struct llama_v2_buffer { + uint8_t * addr = NULL; + size_t size = 0; + + llama_v2_buffer() = default; + + void resize(size_t size) { + delete[] addr; + addr = new uint8_t[size]; + this->size = size; + } + + ~llama_v2_buffer() { + delete[] addr; + } + + // disable copy and move + llama_v2_buffer(const llama_v2_buffer&) = delete; + llama_v2_buffer(llama_v2_buffer&&) = delete; + llama_v2_buffer& operator=(const llama_v2_buffer&) = delete; + llama_v2_buffer& operator=(llama_v2_buffer&&) = delete; +}; + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +struct llama_v2_ctx_buffer { + uint8_t * addr = NULL; + bool is_cuda; + size_t size = 0; + + llama_v2_ctx_buffer() = default; + + void resize(size_t size) { + free(); + + addr = (uint8_t *) ggml_cuda_host_malloc(size); + if (addr) { + is_cuda = true; + } + else { + // fall back to pageable memory + addr = new uint8_t[size]; + is_cuda = false; + } + this->size = size; + } + + void free() { + if (addr) { + if (is_cuda) { + ggml_cuda_host_free(addr); + } + else { + delete[] addr; + } + } + addr = NULL; + } + + ~llama_v2_ctx_buffer() { + free(); + } + + // disable copy and move + llama_v2_ctx_buffer(const llama_v2_ctx_buffer&) = delete; + llama_v2_ctx_buffer(llama_v2_ctx_buffer&&) = delete; + llama_v2_ctx_buffer& operator=(const llama_v2_ctx_buffer&) = delete; + llama_v2_ctx_buffer& operator=(llama_v2_ctx_buffer&&) = delete; +}; +#else +typedef llama_v2_buffer llama_v2_ctx_buffer; +#endif + +#endif diff --git a/otherarch/llama_v2.cpp b/otherarch/llama_v2.cpp index 85de44e06..297ba873e 100644 --- a/otherarch/llama_v2.cpp +++ b/otherarch/llama_v2.cpp @@ -5,7 +5,7 @@ #include #endif -#include "llama-util.h" +#include "llama_v2-util.h" #include "llama_v2.h" #include "ggml_v2.h" @@ -39,67 +39,67 @@ #define LLAMA_V2_MAX_SCRATCH_BUFFERS 16 // available llama models -enum e_model { - MODEL_UNKNOWN, - MODEL_7B, - MODEL_13B, - MODEL_30B, - MODEL_65B, +enum e_model2 { + MODEL_UNKNOWN_2, + MODEL_7B_2, + MODEL_13B_2, + MODEL_30B_2, + MODEL_65B_2, }; -static const size_t MB = 1024*1024; +static const size_t MB_2 = 1024*1024; // computed for n_ctx == 2048 // TODO: dynamically determine these sizes // needs modifications in ggml -static const std::map & MEM_REQ_SCRATCH0() +static const std::map & MEM_REQ_SCRATCH0_2() { - static std::map k_sizes = { - { MODEL_UNKNOWN, 512ull * MB }, - { MODEL_7B, 512ull * MB }, - { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 1024ull * MB }, + static std::map k_sizes = { + { MODEL_UNKNOWN_2, 512ull * MB_2 }, + { MODEL_7B_2, 512ull * MB_2 }, + { MODEL_13B_2, 512ull * MB_2 }, + { MODEL_30B_2, 512ull * MB_2 }, + { MODEL_65B_2, 1024ull * MB_2 }, }; return k_sizes; } -static const std::map & MEM_REQ_SCRATCH1() +static const std::map & MEM_REQ_SCRATCH1_2() { - static std::map k_sizes = { - { MODEL_UNKNOWN, 512ull * MB }, - { MODEL_7B, 512ull * MB }, - { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 1024ull * MB }, + static std::map k_sizes = { + { MODEL_UNKNOWN_2, 512ull * MB_2 }, + { MODEL_7B_2, 512ull * MB_2 }, + { MODEL_13B_2, 512ull * MB_2 }, + { MODEL_30B_2, 512ull * MB_2 }, + { MODEL_65B_2, 1024ull * MB_2 }, }; return k_sizes; } // 2*n_embd*n_ctx*n_layer*sizeof(float16) -static const std::map & MEM_REQ_KV_SELF() +static const std::map & MEM_REQ_KV_SELF_2() { - static std::map k_sizes = { - { MODEL_UNKNOWN, 1026ull * MB }, - { MODEL_7B, 1026ull * MB }, - { MODEL_13B, 1608ull * MB }, - { MODEL_30B, 3124ull * MB }, - { MODEL_65B, 5120ull * MB }, + static std::map k_sizes = { + { 
MODEL_UNKNOWN_2, 1026ull * MB_2 }, + { MODEL_7B_2, 1026ull * MB_2 }, + { MODEL_13B_2, 1608ull * MB_2 }, + { MODEL_30B_2, 3124ull * MB_2 }, + { MODEL_65B_2, 5120ull * MB_2 }, }; return k_sizes; } // this is mostly needed for temporary mul_mat buffers to dequantize the data // not actually needed if BLAS is disabled -static const std::map & MEM_REQ_EVAL() +static const std::map & MEM_REQ_EVAL_2() { - static std::map k_sizes = { - { MODEL_UNKNOWN, 800ull * MB }, - { MODEL_7B, 800ull * MB }, - { MODEL_13B, 1024ull * MB }, - { MODEL_30B, 1280ull * MB }, - { MODEL_65B, 1536ull * MB }, + static std::map k_sizes = { + { MODEL_UNKNOWN_2, 800ull * MB_2 }, + { MODEL_7B_2, 800ull * MB_2 }, + { MODEL_13B_2, 1024ull * MB_2 }, + { MODEL_30B_2, 1280ull * MB_2 }, + { MODEL_65B_2, 1536ull * MB_2 }, }; return k_sizes; } @@ -157,7 +157,7 @@ struct llama_v2_kv_cache { }; struct llama_v2_model { - e_model type = MODEL_UNKNOWN; + e_model2 type = MODEL_UNKNOWN_2; llama_v2_hparams hparams; @@ -276,7 +276,7 @@ struct llama_v2_context { }; template -static T checked_mul(T a, T b) { +static T checked_mul2(T a, T b) { T ret = a * b; if (a != 0 && ret / a != b) { throw format("overflow multiplying %llu * %llu", @@ -285,7 +285,7 @@ static T checked_mul(T a, T b) { return ret; } -static size_t checked_div(size_t a, size_t b) { +static size_t checked_div2(size_t a, size_t b) { if (b == 0 || a % b != 0) { throw format("error dividing %zu / %zu", a, b); } @@ -304,7 +304,7 @@ static std::string llama_v2_format_tensor_shape(const std::vector & ne static size_t llama_v2_calc_tensor_size(const std::vector & ne, enum ggml_v2_type type) { size_t size = ggml_v2_type_size(type); for (uint32_t dim : ne) { - size = checked_mul(size, dim); + size = checked_mul2(size, dim); } return size / ggml_v2_blck_size(type); } @@ -322,9 +322,9 @@ struct llama_v2_load_tensor_shard { }; enum llama_v2_split_type { - SPLIT_NONE, - SPLIT_BY_COLUMNS, - SPLIT_BY_ROWS + SPLIT_NONE_2, + SPLIT_BY_COLUMNS_2, + SPLIT_BY_ROWS_2 }; struct llama_v2_load_tensor { @@ -332,7 +332,7 @@ struct llama_v2_load_tensor { std::string name; enum ggml_v2_type type = GGML_V2_TYPE_F32; - llama_v2_split_type split_type = SPLIT_NONE; + llama_v2_split_type split_type = SPLIT_NONE_2; std::vector ne; size_t size; struct ggml_v2_tensor * ggml_v2_tensor = NULL; @@ -360,13 +360,13 @@ struct llama_v2_load_tensor { void calc_split_type() { if (shards.at(0).ne.size() == 1 || // 1D tensors are just duplicated in every file shards.size() == 1) { // only one file? 
- split_type = SPLIT_NONE; + split_type = SPLIT_NONE_2; } else if (name.find("tok_embeddings.") == 0 || name.find(".attention.wo.weight") != std::string::npos || name.find(".feed_forward.w2.weight") != std::string::npos) { - split_type = SPLIT_BY_COLUMNS; + split_type = SPLIT_BY_COLUMNS_2; } else { - split_type = SPLIT_BY_ROWS; + split_type = SPLIT_BY_ROWS_2; } } @@ -382,16 +382,16 @@ struct llama_v2_load_tensor { LLAMA_V2_ASSERT(shards.size() <= UINT32_MAX); uint32_t n_shards = (uint32_t) shards.size(); switch (split_type) { - case SPLIT_NONE: + case SPLIT_NONE_2: ne = first_shard.ne; break; - case SPLIT_BY_COLUMNS: - ne = {checked_mul(first_shard.ne[0], n_shards), + case SPLIT_BY_COLUMNS_2: + ne = {checked_mul2(first_shard.ne[0], n_shards), first_shard.ne[1]}; break; - case SPLIT_BY_ROWS: + case SPLIT_BY_ROWS_2: ne = {first_shard.ne[0], - checked_mul(first_shard.ne[1], n_shards)}; + checked_mul2(first_shard.ne[1], n_shards)}; break; } } @@ -737,11 +737,11 @@ struct llama_v2_model_loader { if (use_mmap) { LLAMA_V2_ASSERT(lt.shards.size() == 1); lt.data = (uint8_t *) mapping->addr + lt.shards.at(0).file_off; - } else if (lt.split_type == SPLIT_NONE) { + } else if (lt.split_type == SPLIT_NONE_2) { llama_v2_file & file = file_loaders.at(lt.shards.at(0).file_idx)->file; file.seek(lt.shards.at(0).file_off, SEEK_SET); file.read_raw(lt.data, lt.size); - } else if (lt.split_type == SPLIT_BY_ROWS) { + } else if (lt.split_type == SPLIT_BY_ROWS_2) { size_t offset = 0; for (llama_v2_load_tensor_shard & shard : lt.shards) { llama_v2_file & file = file_loaders.at(shard.file_idx)->file; @@ -750,7 +750,7 @@ struct llama_v2_model_loader { offset += shard.size; } LLAMA_V2_ASSERT(offset == lt.size); - } else if (lt.split_type == SPLIT_BY_COLUMNS) { + } else if (lt.split_type == SPLIT_BY_COLUMNS_2) { // Let's load the data into temporary buffers to ensure the OS performs large loads. std::vector tmp_bufs(lt.shards.size()); for (size_t i = 0; i < lt.shards.size(); i++) { @@ -807,7 +807,7 @@ static bool kv_cache_init( const int64_t n_mem = n_layer*n_ctx; const int64_t n_elements = n_embd*n_mem; - cache.buf.resize(2u*n_elements*ggml_v2_type_size(wtype) + 2u*MB); + cache.buf.resize(2u*n_elements*ggml_v2_type_size(wtype) + 2u*MB_2); struct ggml_v2_init_params params; params.mem_size = cache.buf.size; @@ -888,12 +888,12 @@ static const char *llama_v2_ftype_name(enum llama_v2_ftype ftype) { } } -static const char *llama_v2_model_type_name(e_model type) { +static const char *llama_v2_model_type_name(e_model2 type) { switch (type) { - case MODEL_7B: return "7B"; - case MODEL_13B: return "13B"; - case MODEL_30B: return "30B"; - case MODEL_65B: return "65B"; + case MODEL_7B_2: return "7B"; + case MODEL_13B_2: return "13B"; + case MODEL_30B_2: return "30B"; + case MODEL_65B_2: return "65B"; default: printf("\nWARNING: NON-STANDARD LLAMA FILE DETECTED. 
DEFAULT TO 7B SIZE.\n"); return "UNKNOWN"; @@ -925,11 +925,11 @@ static void llama_v2_model_load_internal( { switch (hparams.n_layer) { - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = e_model::MODEL_65B; break; - default: model.type = e_model::MODEL_UNKNOWN; break; + case 32: model.type = e_model2::MODEL_7B_2; break; + case 40: model.type = e_model2::MODEL_13B_2; break; + case 60: model.type = e_model2::MODEL_30B_2; break; + case 80: model.type = e_model2::MODEL_65B_2; break; + default: model.type = e_model2::MODEL_UNKNOWN_2; break; } hparams.n_ctx = n_ctx; @@ -985,13 +985,13 @@ static void llama_v2_model_load_internal( const size_t mem_required = ctx_size + mmapped_size + - MEM_REQ_SCRATCH0().at(model.type) + - MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL().at(model.type); + MEM_REQ_SCRATCH0_2().at(model.type) + + MEM_REQ_SCRATCH1_2().at(model.type) + + MEM_REQ_EVAL_2().at(model.type); // this is the memory required by one llama_v2_state const size_t mem_required_state = - scale*MEM_REQ_KV_SELF().at(model.type); + scale*MEM_REQ_KV_SELF_2().at(model.type); fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); @@ -1462,7 +1462,7 @@ static bool llama_v2_eval_internal( // tokenizer // -static size_t utf8_len(char src) { +static size_t utf8_len2(char src) { const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; uint8_t highbits = static_cast(src) >> 4; return lookup[highbits]; @@ -1503,7 +1503,7 @@ struct llama_v2_tokenizer { size_t offs = 0; while (offs < text.size()) { llama_v2_sp_symbol sym; - size_t char_len = std::min(text.size() - offs, utf8_len(text[offs])); + size_t char_len = std::min(text.size() - offs, utf8_len2(text[offs])); sym.text = text.c_str() + offs; sym.n = char_len; offs += char_len; @@ -2251,10 +2251,10 @@ struct llama_v2_context * llama_v2_init_from_file( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); + ctx->buf_compute.resize(MEM_REQ_EVAL_2().at(ctx->model.type)); - ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0().at(ctx->model.type)); - ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); + ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0_2().at(ctx->model.type)); + ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1_2().at(ctx->model.type)); } return ctx; @@ -3077,5 +3077,15 @@ std::vector legacy_llama_v2_tokenize(struct llama_v2_context * c int n = legacy_llama_v2_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); res.resize(n); + return res; +} + +std::vector llama_v2_tokenize(struct llama_v2_context * ctx, const std::string & text, bool add_bos) { + // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars + std::vector res(text.size() + (int) add_bos); + const int n = llama_v2_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos); + assert(n >= 0); + res.resize(n); + return res; } \ No newline at end of file diff --git a/otherarch/neox_v2.cpp b/otherarch/neox_v2.cpp index d6ac629b1..5748bc0dc 100644 --- a/otherarch/neox_v2.cpp +++ b/otherarch/neox_v2.cpp @@ -16,7 +16,7 @@ // load the model's weights from a file -ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) { +ModelLoadResult gpt_neox_v2_model_load(const std::string & fname, gpt_neox_v2_model & 
model, gpt_vocab & vocab, FileFormat file_format) { printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); auto fin = std::ifstream(fname, std::ios::binary); @@ -344,7 +344,7 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & // feed-forward network ggml_v2_tensor * gpt_neox_ff( - const gpt_neox_layer &layer, + const gpt_neox_layer_v2 &layer, ggml_v2_context * ctx0, ggml_v2_tensor * inp) { ggml_v2_tensor * cur = ggml_v2_norm(ctx0, inp); @@ -386,8 +386,8 @@ ggml_v2_tensor * gpt_neox_ff( // - embd_inp: the embeddings of the tokens in the context // - embd_w: the predicted logits for the next token // -bool gpt_neox_eval( - const gpt_neox_model & model, +bool gpt_neox_v2_eval( + const gpt_neox_v2_model & model, const int n_threads, const int n_past, const std::vector & embd_inp, diff --git a/otherarch/neox_v3.cpp b/otherarch/neox_v3.cpp new file mode 100644 index 000000000..32f13399f --- /dev/null +++ b/otherarch/neox_v3.cpp @@ -0,0 +1,613 @@ +#include "ggml.h" +#include "otherarch.h" + +#include "utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +// load the model's weights from a file +ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) { + printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return ModelLoadResult::FAIL; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != 0x67676d6c) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return ModelLoadResult::FAIL; + } + } + + // load hparams + { + auto & hparams = model.hparams; + hparams.par_res = 1; //true + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); + if(file_format!=FileFormat::NEOX_1 && file_format!=FileFormat::NEOX_2 && file_format!=FileFormat::NEOX_3) + { + fin.read((char *) &hparams.par_res, sizeof(hparams.par_res)); + } + if(file_format==FileFormat::NEOX_3) + { + hparams.par_res = 0; + } + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: n_rot = %d\n", __func__, hparams.n_rot); + printf("%s: par_res = %d\n", __func__, hparams.par_res); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + const int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + word.resize(len); + fin.read((char *) word.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] 
= word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return ModelLoadResult::FAIL; + } + + auto & ctx = model.ctx; + + size_t ctx_size = 0; + + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g + ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte + + ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g + //ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b + + ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w + ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b + + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w + ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b + + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g + ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w + ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b + + ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w + ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b + + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k + ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v + + ctx_size += (6 + 16*n_layer)*512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params; + params.mem_size = ctx_size; + params.mem_buffer = NULL; + params.no_alloc = false; + + model.ctx = ggml_init(params); + if (!model.ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return ModelLoadResult::FAIL; + } + } + + // prepare memory for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + //model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab); + + // map by name + model.tensors["gpt_neox.embed_in.weight"] = model.wte; + + model.tensors["gpt_neox.final_layer_norm.weight"] = model.ln_f_g; + model.tensors["gpt_neox.final_layer_norm.bias"] = model.ln_f_b; + + model.tensors["embed_out.weight"] = model.lmh_g; + //model.tensors["lm_head.bias"] = model.lmh_b; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.weight"] = layer.ln_1_g; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".input_layernorm.bias"] = layer.ln_1_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.weight"] = layer.c_attn_attn_w; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.query_key_value.bias"] = layer.c_attn_attn_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.weight"] = layer.c_attn_proj_w; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".attention.dense.bias"] = layer.c_attn_proj_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.weight"] = layer.ln_2_g; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".post_attention_layernorm.bias"] = layer.ln_2_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.weight"] = layer.c_mlp_fc_w; + model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_h_to_4h.bias"] = layer.c_mlp_fc_b; + + model.tensors["gpt_neox.layers." + std::to_string(i) + ".mlp.dense_4h_to_h.weight"] = layer.c_mlp_proj_w; + model.tensors["gpt_neox.layers." 
+ std::to_string(i) + ".mlp.dense_4h_to_h.bias"] = layer.c_mlp_proj_b; + } + } + + // key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int64_t n_mem = n_layer*n_ctx; + const int64_t n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem); + } + + // load weights + { + int n_tensors = 0; + size_t total_size = 0; + + printf("%s: ", __func__); + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + return ModelLoadResult::FAIL; + } + + auto tensor = model.tensors[name.data()]; + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + return ModelLoadResult::FAIL; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n", + __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return ModelLoadResult::FAIL; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + size_t bpe = ggml_type_size(ggml_type(ttype)); + + if(file_format==FileFormat::NEOX_1) + { + switch (ttype) { + case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; + case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; + case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; + case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; + default: + { + fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ttype); + return ModelLoadResult::FAIL; + } + }; + } + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); + ggml_free(ctx); + return ModelLoadResult::RETRY_LOAD; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + total_size += ggml_nbytes(tensor); + if (++n_tensors % 8 == 0) { + printf("."); + fflush(stdout); + } + } + + printf(" done\n"); + + printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); + } + + fin.close(); + + return ModelLoadResult::SUCCESS; +} + + +// feed-forward network +ggml_tensor * gpt_neox_ff( + const gpt_neox_layer &layer, + ggml_context * ctx0, + ggml_tensor * inp) { + ggml_tensor * cur = 
ggml_norm(ctx0, inp); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, layer.ln_2_g, cur), + cur), + ggml_repeat(ctx0, layer.ln_2_b, cur)); + + cur = ggml_mul_mat(ctx0, + layer.c_mlp_fc_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, layer.c_mlp_fc_b, cur), + cur); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // projection + // cur = proj_w*cur + proj_b + cur = ggml_mul_mat(ctx0, + layer.c_mlp_proj_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, layer.c_mlp_proj_b, cur), + cur); + return cur; +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt_neox_eval( + const gpt_neox_model & model, + const int n_threads, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w, + size_t & mem_per_token) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + const int n_vocab = hparams.n_vocab; + const int n_rot = hparams.n_rot; + + static size_t buf_size = 256u*1024*1024; + static void * buf = malloc(buf_size); + + if (mem_per_token > 0 && (mem_per_token*N*2 + 64u*1024*1024) > buf_size) { + const size_t buf_size_new = 360u*1024*1024 + 2*(mem_per_token*N); // add 10% to account for ggml object overhead + //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); + + // reallocate + buf_size = buf_size_new; + buf = realloc(buf, buf_size); + if (buf == nullptr) { + fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); + return false; + } + } + + struct ggml_init_params params; + params.mem_size = buf_size; + params.mem_buffer = buf; + params.no_alloc = false; + + + struct ggml_context * ctx0 = ggml_init(params); + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; + + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + // wte + struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // self-attention + { + { + cur = ggml_norm(ctx0, inpL); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_1_g, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_1_b, cur)); + } + + // compute QKV + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur), + cur); + } + + struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 0*sizeof(float)*n_embd/n_head)); + struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 1*sizeof(float)*n_embd/n_head)); + struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_3d(ctx0, cur, n_embd/n_head, n_head, N, cur->nb[1]/n_head, cur->nb[1], 2*sizeof(float)*n_embd/n_head)); + + // using mode = 2 for GPT-NeoX mode + Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, n_rot, 2); + Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, n_rot, 2); + + // store key and value to memory + { + Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd, N)); + + struct 
ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_2d(ctx0, model.memory_v, N, n_embd, + ( n_ctx)*ggml_element_size(model.memory_v), + (il*n_ctx)*ggml_element_size(model.memory_v)*n_embd + n_past*ggml_element_size(model.memory_v)); + + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + struct ggml_tensor * Q = + ggml_permute(ctx0, + Qcur, + 0, 2, 1, 3); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + struct ggml_tensor * KQ_scaled = + ggml_scale_inplace(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) + ); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + struct ggml_tensor * V = + ggml_view_3d(ctx0, model.memory_v, + n_past + N, n_embd/n_head, n_head, + n_ctx*ggml_element_size(model.memory_v), + n_ctx*ggml_element_size(model.memory_v)*n_embd/n_head, + il*n_ctx*ggml_element_size(model.memory_v)*n_embd); + + // KQV = transpose(V) * KQ_soft_max + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + + // projection + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + + cur = ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur), cur); + } + } + + if (hparams.par_res == 0) { + struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL); + + cur = gpt_neox_ff(model.layers[il], ctx0, inpFF); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + } else { + struct ggml_tensor * inpFF = cur; + + // this is independent of the self-attention result, so it could be done in parallel to the self-attention + // note here we pass inpL instead of cur + cur = gpt_neox_ff(model.layers[il], ctx0, inpL); + + // layer input + FF + cur = ggml_add(ctx0, cur, inpFF); + + // input for next layer + inpL = ggml_add(ctx0, cur, inpL); + } + } + + // norm + { + inpL = ggml_norm(ctx0, inpL); + + // inpL = ln_f_g*inpL + ln_f_b + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_f_g, inpL), + inpL), + ggml_repeat(ctx0, model.ln_f_b, inpL)); + } + + // lm_head + { + inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL); + + //inpL = ggml_add(ctx0, + // ggml_repeat(ctx0, model.lmh_b, inpL), + // inpL); + } + + // logits -> probs + //inpL = ggml_soft_max_inplace(ctx0, inpL); + + // run the computation + ggml_build_forward_expand(&gf, inpL); + ggml_graph_compute (ctx0, &gf); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); 
+ //} + + //embd_w.resize(n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + + // return result for just the last token + embd_w.resize(n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + + if (mem_per_token == 0) { + mem_per_token = ggml_used_mem(ctx0)/N; + } + //printf("used_mem = %zu\n", ggml_used_mem(ctx0)); + + ggml_free(ctx0); + + return true; +} \ No newline at end of file diff --git a/otherarch/otherarch.h b/otherarch/otherarch.h index 3644dc006..83e558270 100644 --- a/otherarch/otherarch.h +++ b/otherarch/otherarch.h @@ -87,7 +87,7 @@ struct gptj_layer_v1 { struct ggml_v1_tensor * c_mlp_proj_b; }; -struct gptj_model_v1 { +struct gptj_v1_model { gptj_hparams hparams; // normalization @@ -110,7 +110,7 @@ struct gptj_model_v1 { std::map tensors; }; -struct gptj_model_v2 { +struct gptj_v2_model { gptj_hparams hparams; // normalization @@ -122,7 +122,7 @@ struct gptj_model_v2 { struct ggml_v2_tensor * lmh_g; // language model head struct ggml_v2_tensor * lmh_b; // language model bias - std::vector layers; + std::vector layers; // key + value memory struct ggml_v2_tensor * memory_k;
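
The otherarch.h renames above let several generations of the same architecture's structs and loaders coexist in one binary (gptj_v1_model for the ggml_v1 era, gptj_v2_model for ggml_v2, and the unsuffixed gptj_model used by the new gptj_v3.cpp). The glue is ModelLoadResult::RETRY_LOAD: the newest loader bails out on a recoverable mismatch (for example the transposed-tensor case in gptj_v3.cpp above) and the caller retries an older code path. A hedged sketch of that fallback; the wrapper function is illustrative and not part of this patch, and the assumption that model_adapter.h declares these loaders may not hold:

```cpp
#include "model_adapter.h"  // assumption: declares ModelLoadResult, gpt_vocab and the loaders
#include <string>

// Illustrative only: try the current GPT-J loader first; if it reports
// RETRY_LOAD, fall back to the ggml_v2-era loader.
static bool try_load_gptj(const std::string & fname,
                          gptj_model    & model_v3,
                          gptj_v2_model & model_v2,
                          gpt_vocab     & vocab,
                          int             gpulayers) {
    const ModelLoadResult res = gptj_model_load(fname, model_v3, vocab, gpulayers);
    if (res == ModelLoadResult::SUCCESS) {
        return true;
    }
    if (res == ModelLoadResult::RETRY_LOAD) {
        return gptj_v2_model_load(fname, model_v2, vocab, gpulayers) == ModelLoadResult::SUCCESS;
    }
    return false;  // ModelLoadResult::FAIL
}
```
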