From 0711a5f6dce7f04c2a791b14bc47f7d4cb545408 Mon Sep 17 00:00:00 2001 From: Aaron Miller Date: Sat, 17 Jun 2023 07:37:49 -0700 Subject: [PATCH 01/40] metal : add norm, cpy f16->f16, alibi kernels (#1823) --- ggml-metal.m | 73 +++++++++++++++++++++++ ggml-metal.metal | 149 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 222 insertions(+) diff --git a/ggml-metal.m b/ggml-metal.m index 0e9b56aa3..814851203 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -57,6 +57,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(get_rows_q5_k); GGML_METAL_DECL_KERNEL(get_rows_q6_k); GGML_METAL_DECL_KERNEL(rms_norm); + GGML_METAL_DECL_KERNEL(norm); GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); @@ -66,8 +67,10 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32); GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32); GGML_METAL_DECL_KERNEL(rope); + GGML_METAL_DECL_KERNEL(alibi_f32); GGML_METAL_DECL_KERNEL(cpy_f32_f16); GGML_METAL_DECL_KERNEL(cpy_f32_f32); + GGML_METAL_DECL_KERNEL(cpy_f16_f16); #undef GGML_METAL_DECL_KERNEL }; @@ -162,6 +165,7 @@ struct ggml_metal_context * ggml_metal_init(void) { GGML_METAL_ADD_KERNEL(get_rows_q5_k); GGML_METAL_ADD_KERNEL(get_rows_q6_k); GGML_METAL_ADD_KERNEL(rms_norm); + GGML_METAL_ADD_KERNEL(norm); GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); @@ -171,8 +175,10 @@ struct ggml_metal_context * ggml_metal_init(void) { GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32); GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32); GGML_METAL_ADD_KERNEL(rope); + GGML_METAL_ADD_KERNEL(alibi_f32); GGML_METAL_ADD_KERNEL(cpy_f32_f16); GGML_METAL_ADD_KERNEL(cpy_f32_f32); + GGML_METAL_ADD_KERNEL(cpy_f16_f16); #undef GGML_METAL_ADD_KERNEL } @@ -735,6 +741,65 @@ void ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; + case GGML_OP_NORM: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + const float eps = 1e-5f; + + const int nth = 256; + + [encoder setComputePipelineState:ctx->pipeline_norm]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setBytes:&eps length:sizeof( float) atIndex:4]; + [encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0]; + + const int64_t nrows = ggml_nrows(src0); + + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; + case GGML_OP_ALIBI: + { + GGML_ASSERT((src0t == GGML_TYPE_F32)); + const int n_past = ((int32_t *) src1->data)[0]; + const int n_head = ((int32_t *) src1->data)[1]; + const float max_bias = ((float *) src1->data)[2]; + if (__builtin_popcount(n_head) != 1) { + GGML_ASSERT(false && "only power-of-two n_head implemented"); + } + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + [encoder setComputePipelineState:ctx->pipeline_alibi_f32]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne01 length:sizeof( int64_t) 
atIndex:3]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14]; + [encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15]; + [encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16]; + [encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&m0 length:sizeof( float) atIndex:18]; + const int nth = 32; + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; + } break; case GGML_OP_ROPE: { if (encoder == nil) { @@ -788,6 +853,14 @@ void ggml_metal_graph_compute( default: GGML_ASSERT(false && "not implemented"); }; } break; + case GGML_TYPE_F16: + { + switch (dstt) { + case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break; + case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break; + default: GGML_ASSERT(false && "not implemented"); + }; + } break; default: GGML_ASSERT(false && "not implemented"); } diff --git a/ggml-metal.metal b/ggml-metal.metal index 09e12a879..d1e49222d 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -256,6 +256,72 @@ kernel void kernel_get_rows_q4_1( (device float *) ((device char *) dst + i*nb1), ne00); } +kernel void kernel_norm( + device const void * src0, + device float * dst, + constant int64_t & ne00, + constant uint64_t & nb01, + constant float & eps, + threadgroup float * sum [[threadgroup(0)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint ntg[[threads_per_threadgroup]]) { + device const float * x = (device const float *) ((device const char *) src0 + tgpig*nb01); + // MEAN + // parallel sum + sum[tpitg] = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + sum[tpitg] += x[i00]; + } + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg/2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + // broadcast + if (tpitg == 0) { + sum[0] /= ne00; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + const float mean = sum[0]; + + // recenter + device float * y = dst + tgpig*ne00; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + y[i00] = x[i00] - mean; + } + + // VARIANCE + // parallel sum + sum[tpitg] = 0.0f; + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + sum[tpitg] += y[i00] * y[i00]; + } + // reduce + threadgroup_barrier(mem_flags::mem_threadgroup); + for (uint i = ntg/2; i > 0; i /= 2) { + if (tpitg < i) { + sum[tpitg] += sum[tpitg + i]; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + } + // broadcast + if (tpitg == 0) { + sum[0] /= ne00; + } + threadgroup_barrier(mem_flags::mem_threadgroup); + const float variance = sum[0]; + + const float scale = 1.0f/sqrt(variance + eps); + for (int i00 = tpitg; i00 < ne00; i00 += ntg) { + y[i00] = y[i00] * scale; + } +} + + kernel void kernel_rms_norm( device const void * src0, device 
float * dst, @@ -485,6 +551,48 @@ kernel void kernel_mul_mat_f16_f32( } } +kernel void kernel_alibi_f32( + device const float * src0, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + constant float & m0, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device float * dst_data = (device float *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + float m_k = pow(m0, i2 + 1); + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0] + m_k * (i00 - ne00 + 1); + } +} + kernel void kernel_rope( device const void * src0, device float * dst, @@ -540,6 +648,47 @@ kernel void kernel_rope( } } +kernel void kernel_cpy_f16_f16( + device const half * src0, + device half * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant uint64_t & nb0, + constant uint64_t & nb1, + constant uint64_t & nb2, + constant uint64_t & nb3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]]) { + const int64_t i03 = tgpig[2]; + const int64_t i02 = tgpig[1]; + const int64_t i01 = tgpig[0]; + + const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + + const int64_t i3 = n / (ne2*ne1*ne0); + const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0); + const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0; + const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0); + + device half * dst_data = (device half *) ((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + + for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { + device const half * src = (device half *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); + dst_data[i00] = src[0]; + } +} + kernel void kernel_cpy_f32_f16( device const float * src0, device half * dst, From 3d59ec5935ea1d33e9d51060a8dd737169b9b89b Mon Sep 17 00:00:00 2001 From: Howard Su Date: Sat, 17 Jun 2023 23:46:15 +0800 Subject: [PATCH 02/40] ggml : fix warnings under MSVC (#1908) --- ggml-cuda.cu | 4 ++++ ggml-opencl.cpp | 4 ++++ llama.cpp | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7edd1a9f8..fed2a7ce1 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -13,6 +13,10 @@ #include "ggml-cuda.h" #include "ggml.h" +#if defined(_MSC_VER) 
+#pragma warning(disable: 4244 4267) // possible loss of data +#endif + static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); #define CUDA_CHECK(err) \ diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 1d4db96ee..95f4cec6d 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -15,6 +15,10 @@ #include "ggml.h" +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + #define CL_DMMV_BLOCK_SIZE 32 #define MULTILINE_QUOTE(...) #__VA_ARGS__ diff --git a/llama.cpp b/llama.cpp index 81f047ed2..a50846f71 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1253,7 +1253,7 @@ static void llama_model_load_internal( vram_scratch = n_batch * MB; ggml_cuda_set_scratch_size(vram_scratch); if (n_gpu_layers > 0) { - fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n", + fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n", __func__, vram_scratch / MB); } } From 86c7571864ff331f8cdb9e092f3abeb123729a56 Mon Sep 17 00:00:00 2001 From: DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com> Date: Sat, 17 Jun 2023 18:17:22 +0200 Subject: [PATCH 03/40] make : update for latest Arch (#1701) With the upcoming change to the openblas package in arch the Makefile workaround is no longer needed. --- Makefile | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Makefile b/Makefile index eee9eeb53..72d6ad40c 100644 --- a/Makefile +++ b/Makefile @@ -144,11 +144,7 @@ endif # LLAMA_NO_ACCELERATE ifdef LLAMA_OPENBLAS CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas - ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),) - LDFLAGS += -lopenblas -lcblas - else - LDFLAGS += -lopenblas - endif + LDFLAGS += -lopenblas endif # LLAMA_OPENBLAS ifdef LLAMA_BLIS From 051e1b0e6a6e3aee7d989b47760980e6fda5861c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 17 Jun 2023 19:30:22 +0300 Subject: [PATCH 04/40] llama : fix kv_cache `n` init (close #1903) --- .gitignore | 1 + examples/CMakeLists.txt | 1 + llama.cpp | 2 ++ 3 files changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index e68fd724a..e7bfd52e3 100644 --- a/.gitignore +++ b/.gitignore @@ -34,6 +34,7 @@ models/* /perplexity /embedding /train-text-from-scratch +/simple /benchmark-matmult /vdot /server diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index de005f3e3..cf9c4a223 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -38,6 +38,7 @@ else() add_subdirectory(benchmark) add_subdirectory(baby-llama) add_subdirectory(train-text-from-scratch) + add_subdirectory(simple) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/llama.cpp b/llama.cpp index a50846f71..a2916b3e8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -886,6 +886,7 @@ static bool kv_cache_init( const int64_t n_elements = n_embd*n_mem; cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + cache.n = 0; struct ggml_init_params params; params.mem_size = cache.buf.size; @@ -904,6 +905,7 @@ static bool kv_cache_init( ggml_set_name(cache.k, "cache_k"); ggml_set_name(cache.v, "cache_v"); + (void) n_gpu_layers; #ifdef GGML_USE_CUBLAS if (n_gpu_layers > n_layer + 1) { ggml_cuda_assign_buffers_no_scratch(cache.v); From 2c9380dd2f77e41149340f3ecb09764d793b16db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 17 Jun 2023 19:15:02 +0200 Subject: [PATCH 05/40] Only one CUDA stream per device for async compute (#1898) --- 
README.md | 1 - examples/common.cpp | 3 --- ggml-cuda.cu | 54 +++++++++++++++++---------------------------- 3 files changed, 20 insertions(+), 38 deletions(-) diff --git a/README.md b/README.md index b9759b00b..7defb7584 100644 --- a/README.md +++ b/README.md @@ -336,7 +336,6 @@ Building the program with BLAS support may lead to some performance improvements cmake .. -DLLAMA_CUBLAS=ON cmake --build . --config Release ``` - Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1. The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. diff --git a/examples/common.cpp b/examples/common.cpp index 055383bef..fed24e027 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -106,9 +106,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } if (arg == "-s" || arg == "--seed") { -#if defined(GGML_USE_CUBLAS) - fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n"); -#endif if (++i >= argc) { invalid_param = true; break; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index fed2a7ce1..16488b9f9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1467,19 +1467,13 @@ static void * g_scratch_buffer = nullptr; static size_t g_scratch_size = 1024*1024*1024; // 1 GB by default static size_t g_scratch_offset = 0; -#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication. -#define GGML_CUDA_MAX_EVENTS 64 - static int g_device_count = -1; static int g_main_device = 0; static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; -static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr }; - -static cudaStream_t g_cudaStreams_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { nullptr }; -static cudaEvent_t g_cudaEvents_memcpy_src1[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_EVENTS] = { nullptr }; +static cudaStream_t g_cudaStreams_main[GGML_CUDA_MAX_DEVICES] = { nullptr }; void ggml_init_cublas() { static bool initialized = false; @@ -1503,15 +1497,8 @@ void ggml_init_cublas() { for (int id = 0; id < g_device_count; ++id) { CUDA_CHECK(cudaSetDevice(id)); - // create streams - for (int i = 0; i < GGML_CUDA_MAX_STREAMS; ++i) { - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id][i], cudaStreamNonBlocking)); - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_memcpy_src1[id][i], cudaStreamNonBlocking)); - } - // create events - for (int i = 0; i < GGML_CUDA_MAX_EVENTS; ++i) { - CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents_memcpy_src1[id][i], cudaEventDisableTiming)); - } + // create main stream + CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams_main[id], cudaStreamNonBlocking)); // create cublas handle CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); @@ -1978,6 +1965,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0}; size_t dst_asf[GGML_CUDA_MAX_DEVICES] = {0}; + // if multiple GPUs are used they need to wait for the main GPU to finish + if (split && g_device_count > 1) { + CUDA_CHECK(cudaSetDevice(g_main_device)); + CUDA_CHECK(cudaDeviceSynchronize()); + 
} + for (int id = 0; id < g_device_count; ++id) { if (!split && id != g_main_device) { continue; @@ -2076,9 +2069,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm } const int64_t i11 = i13*ne12 + i12; - cudaStream_t cudaStream_main = g_cudaStreams_main[id][i0 % GGML_CUDA_MAX_STREAMS]; - cudaStream_t cudaStream_memcpy_src1 = g_cudaStreams_memcpy_src1[id][i0 % GGML_CUDA_MAX_STREAMS]; - cudaEvent_t cudaEvent_memcpy_src1 = g_cudaEvents_memcpy_src1[id][i0 % GGML_CUDA_MAX_EVENTS]; + cudaStream_t cudaStream_main = g_cudaStreams_main[id]; // for split tensors the data begins at i0 == i0_offset_low char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs; @@ -2106,14 +2097,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm if (src1->backend == GGML_BACKEND_CPU) { GGML_ASSERT(!flatten_rows || nrows0 == ggml_nrows(src1)); int64_t nrows1 = flatten_rows ? nrows0 : ne11; - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_memcpy_src1)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf_i, src1, i03, i02, 0, nrows1, cudaStream_main)); } else if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) { if (id != g_main_device) { GGML_ASSERT(!flatten_rows); float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; src1_ddf_i_source += i11*src1_stride; CUDA_CHECK(cudaMemcpyAsync(src1_ddf_i, src1_ddf_i_source, src1_stride*sizeof(float), - cudaMemcpyDeviceToDevice, cudaStream_memcpy_src1)); + cudaMemcpyDeviceToDevice, cudaStream_main)); } } else if (src1_on_device && !src1_is_contiguous) { GGML_ASSERT(!split); @@ -2122,7 +2113,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm GGML_ASSERT(false); } } - CUDA_CHECK(cudaEventRecord(cudaEvent_memcpy_src1, cudaStream_memcpy_src1)); if (!src0_on_device || !src0_is_contiguous) { if (src0_is_f32) { @@ -2138,9 +2128,6 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm CUDA_CHECK(cudaGetLastError()); } - // wait with main stream until src1 memcpy is done - CUDA_CHECK(cudaStreamWaitEvent(cudaStream_main, cudaEvent_memcpy_src1, 0)); - // do the computation op(src0, src1, dst, src0_ddq_i, src0_ddf_i, src1_ddf_i, dst_ddf_i, i02, i01_low, i01_high, i11, cudaStream_main); @@ -2178,8 +2165,13 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm // wait until each device is finished, then free their buffers for (int id = 0; id < g_device_count; ++id) { + if (src0_asq[id] == 0 && src0_asf[id] == 0 && src1_asf[id] == 0 && dst_asf[id] == 0) { + continue; + } + CUDA_CHECK(cudaSetDevice(id)); CUDA_CHECK(cudaDeviceSynchronize()); + if (src0_asq[id] > 0) { ggml_cuda_pool_free(src0_ddq[id], src0_asq[id]); } @@ -2245,7 +2237,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr const int64_t ne02 = src0->ne[2]; CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0]; + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; @@ -2257,8 +2249,6 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main); - - 
CUDA_CHECK(cudaDeviceSynchronize()); } void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ @@ -2276,7 +2266,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int64_t nb02 = src0->nb[2]; CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0]; + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; @@ -2291,8 +2281,6 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int channel_stride_x = nb02 / sizeof(half); ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main); - - CUDA_CHECK(cudaDeviceSynchronize()); } void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -2348,7 +2336,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens const int64_t nb12 = src1->nb[2]; CUDA_CHECK(cudaSetDevice(g_main_device)); - cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device][0]; + cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; @@ -2366,8 +2354,6 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens GGML_ASSERT(false); } - CUDA_CHECK(cudaDeviceSynchronize()); - (void) dst; } From 4f9c43e3bd488b7561119785485e1155dba338d7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 17 Jun 2023 20:24:11 +0300 Subject: [PATCH 06/40] minor : warning fixes --- examples/main/main.cpp | 2 +- ggml-metal.m | 27 ++++++++++++++++----------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index a051fcbc5..941312f9c 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -354,7 +354,7 @@ int main(int argc, char ** argv) { if ((int)embd.size() > max_embd_size) { auto skipped_tokens = embd.size() - max_embd_size; console_set_color(con_st, CONSOLE_COLOR_ERROR); - printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + printf("<>", skipped_tokens, skipped_tokens != 1 ? 
"s" : ""); console_set_color(con_st, CONSOLE_COLOR_DEFAULT); fflush(stdout); embd.resize(max_embd_size); diff --git a/ggml-metal.m b/ggml-metal.m index 814851203..07da62a25 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -256,10 +256,10 @@ bool ggml_metal_add_buffer( if (ctx->buffers[ctx->n_buffers].metal == nil) { fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0); return false; - } else { - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0); } + fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0); + ++ctx->n_buffers; } @@ -765,18 +765,23 @@ void ggml_metal_graph_compute( } break; case GGML_OP_ALIBI: { - GGML_ASSERT((src0t == GGML_TYPE_F32)); - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; - const float max_bias = ((float *) src1->data)[2]; - if (__builtin_popcount(n_head) != 1) { - GGML_ASSERT(false && "only power-of-two n_head implemented"); - } - const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); - const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); if (encoder == nil) { encoder = [command_buffer computeCommandEncoder]; } + + GGML_ASSERT((src0t == GGML_TYPE_F32)); + + const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past); + const int n_head = ((int32_t *) src1->data)[1]; + const float max_bias = ((float *) src1->data)[2]; + + if (__builtin_popcount(n_head) != 1) { + GGML_ASSERT(false && "only power-of-two n_head implemented"); + } + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + [encoder setComputePipelineState:ctx->pipeline_alibi_f32]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; From b2416493ab3ab21686d47c96669da6d6c6af08a4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 17 Jun 2023 20:55:03 +0300 Subject: [PATCH 07/40] make : do not print help for simple example --- Makefile | 3 --- 1 file changed, 3 deletions(-) diff --git a/Makefile b/Makefile index 72d6ad40c..cf590862b 100644 --- a/Makefile +++ b/Makefile @@ -276,9 +276,6 @@ main: examples/main/main.cpp build-info.h ggml. simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) - @echo - @echo '==== Run ./simple -h for help. 
====' - @echo quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) From 57cd69460f736031a3fc54af1e97c03f80128478 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Sun, 18 Jun 2023 12:29:47 +0800 Subject: [PATCH 08/40] cmake : add CUDA_ARCHITECTURES to new target ggml_static (#1917) --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f5a968533..736771954 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -492,6 +492,10 @@ if (GGML_SOURCES_CUDA) message(STATUS "GGML CUDA sources found, configuring CUDA architecture") set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF) set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") + + set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES OFF) + set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") + set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF) endif() From ce2c7d72e2d06988b5ddec6811ab923254542077 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 18 Jun 2023 09:09:47 +0300 Subject: [PATCH 09/40] metal : handle buffers larger than device's maxBufferLength (#1826) * metal : handle buffers larger than device's maxBufferLength * metal : print more verbose device info + handle errors * metal : fix prints for overlapping views * metal : minimize view overlap to try to utilize device memory better --- Makefile | 2 +- ggml-metal.h | 5 ++- ggml-metal.m | 98 ++++++++++++++++++++++++++++++++++++++++++---------- ggml.c | 24 +++++++++++-- ggml.h | 5 +-- llama.cpp | 26 ++++++++------ 6 files changed, 125 insertions(+), 35 deletions(-) diff --git a/Makefile b/Makefile index cf590862b..afd06e0a6 100644 --- a/Makefile +++ b/Makefile @@ -252,7 +252,7 @@ $(info ) ggml.o: ggml.c ggml.h ggml-cuda.h $(CC) $(CFLAGS) -c $< -o $@ -llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h +llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: examples/common.cpp examples/common.h diff --git a/ggml-metal.h b/ggml-metal.h index 033c4d86a..b9e50ac74 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx); // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute // - the mapping is used during computation to determine the arguments of the compute kernels // - you don't need to keep the host memory buffer allocated as it is never accessed by Metal +// - max_size specifies the maximum size of a tensor and is used to create shared views such +// that it is guaranteed that the tensor will fit in at least one of the views // bool ggml_metal_add_buffer( struct ggml_metal_context * ctx, const char * name, void * data, - size_t size); + size_t size, + size_t max_size); // set data from host memory into the device void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t); diff --git a/ggml-metal.m b/ggml-metal.m index 07da62a25..a7e104dc7 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -183,6 +183,14 @@ struct ggml_metal_context * ggml_metal_init(void) { #undef GGML_METAL_ADD_KERNEL } + fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); + fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? 
"true" : "false"); + if (ctx->device.maxTransferRate != 0) { + fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); + } else { + fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__); + } + return ctx; } @@ -199,10 +207,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { static id ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) { //fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach); + const int64_t tsize = ggml_nbytes(t); + + // find the view that contains the tensor fully for (int i = 0; i < ctx->n_buffers; ++i) { const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data; - if (ioffs >= 0 && ioffs < (int64_t) ctx->buffers[i].size) { + if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) { *offs = (size_t) ioffs; //fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs); @@ -220,7 +231,8 @@ bool ggml_metal_add_buffer( struct ggml_metal_context * ctx, const char * name, void * data, - size_t size) { + size_t size, + size_t max_size) { if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) { fprintf(stderr, "%s: too many buffers\n", __func__); return false; @@ -237,30 +249,68 @@ bool ggml_metal_add_buffer( } } - size_t page_size = getpagesize(); - size_t aligned_size = size; - if ((aligned_size % page_size) != 0) { - aligned_size += (page_size - (aligned_size % page_size)); + const size_t size_page = getpagesize(); + + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); } - ctx->buffers[ctx->n_buffers].name = name; - ctx->buffers[ctx->n_buffers].data = data; - ctx->buffers[ctx->n_buffers].size = size; + // the buffer fits into the max buffer size allowed by the device + if (size_aligned <= ctx->device.maxBufferLength) { + ctx->buffers[ctx->n_buffers].name = name; + ctx->buffers[ctx->n_buffers].data = data; + ctx->buffers[ctx->n_buffers].size = size; - if (ctx->device.maxBufferLength < aligned_size) { - fprintf(stderr, "%s: buffer '%s' size %zu is larger than buffer maximum of %zu\n", __func__, name, aligned_size, ctx->device.maxBufferLength); - return false; - } - ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:aligned_size options:MTLResourceStorageModeShared deallocator:nil]; + ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; - if (ctx->buffers[ctx->n_buffers].metal == nil) { - fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0); - return false; + if (ctx->buffers[ctx->n_buffers].metal == nil) { + fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); + return false; + } + + fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); + + ++ctx->n_buffers; + } else { + // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into + // one of the views + const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case + const size_t size_step = ctx->device.maxBufferLength - size_ovlp; + const 
size_t size_view = ctx->device.maxBufferLength; + + for (size_t i = 0; i < size; i += size_step) { + const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i); + + ctx->buffers[ctx->n_buffers].name = name; + ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i); + ctx->buffers[ctx->n_buffers].size = size_step_aligned; + + ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; + + if (ctx->buffers[ctx->n_buffers].metal == nil) { + fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); + return false; + } + + fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); + if (i + size_step < size) { + fprintf(stderr, "\n"); + } + + ++ctx->n_buffers; + } } - fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB\n", __func__, name, aligned_size / 1024.0 / 1024.0); + fprintf(stderr, ", (%8.2f / %8.2f)", + ctx->device.currentAllocatedSize / 1024.0 / 1024.0, + ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); - ++ctx->n_buffers; + if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { + fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n"); + } else { + fprintf(stderr, "\n"); + } } return true; @@ -909,4 +959,14 @@ void ggml_metal_graph_compute( dispatch_barrier_sync(queue, ^{}); [command_buffers[n_cb - 1] waitUntilCompleted]; + + // check status of command buffers + // needed to detect if the device ran out-of-memory for example (#1881) + for (int i = 0; i < n_cb; i++) { + MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status]; + if (status != MTLCommandBufferStatusCompleted) { + fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status); + GGML_ASSERT(false); + } + } } diff --git a/ggml.c b/ggml.c index 0eda7f338..78c365354 100644 --- a/ggml.c +++ b/ggml.c @@ -4154,14 +4154,34 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { ctx->no_alloc = no_alloc; } -void * ggml_get_mem_buffer(struct ggml_context * ctx) { +void * ggml_get_mem_buffer(const struct ggml_context * ctx) { return ctx->mem_buffer; } -size_t ggml_get_mem_size(struct ggml_context * ctx) { +size_t ggml_get_mem_size(const struct ggml_context * ctx) { return ctx->mem_size; } +size_t ggml_get_max_tensor_size(const struct ggml_context * ctx) { + size_t max_size = 0; + + struct ggml_object * obj = ctx->objects_begin; + + while (obj != NULL) { + struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs); + + const size_t size = ggml_nbytes(tensor); + + if (max_size < size) { + max_size = size; + } + + obj = obj->next; + } + + return max_size; +} + // IMPORTANT: // when creating "opt" tensors, always save and load the scratch buffer // this is an error prone process, but it is necessary to support inplace diff --git a/ggml.h b/ggml.h index 9b0c846f8..1380c530f 100644 --- a/ggml.h +++ b/ggml.h @@ -500,8 +500,9 @@ extern "C" { GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); - GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx); - GGML_API size_t ggml_get_mem_size (struct ggml_context * 
ctx); + GGML_API void * ggml_get_mem_buffer (const struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (const struct ggml_context * ctx); + GGML_API size_t ggml_get_max_tensor_size(const struct ggml_context * ctx); GGML_API struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, diff --git a/llama.cpp b/llama.cpp index a2916b3e8..c165d3239 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2696,16 +2696,21 @@ struct llama_context * llama_init_from_file( // this allocates all Metal resources and memory buffers ctx->ctx_metal = ggml_metal_init(); - void *data_ptr = NULL; + void * data_ptr = NULL; size_t data_size = 0; + if (params.use_mmap) { - data_ptr = ctx->model.mapping->addr; - data_size= ctx->model.mapping->size; + data_ptr = ctx->model.mapping->addr; + data_size = ctx->model.mapping->size; } else { - data_ptr = ggml_get_mem_buffer(ctx->model.ctx); - data_size= ggml_get_mem_size(ctx->model.ctx); + data_ptr = ggml_get_mem_buffer(ctx->model.ctx); + data_size = ggml_get_mem_size (ctx->model.ctx); } + const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); + + printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + #define LLAMA_METAL_CHECK_BUF(result) \ if (!(result)) { \ fprintf(stderr, "%s: failed to add buffer\n", __func__); \ @@ -2713,12 +2718,13 @@ struct llama_context * llama_init_from_file( return NULL; \ } - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); + + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); #undef LLAMA_METAL_CHECK_BUF } #endif From 90cc59d6ab1363a5c69c60c4b94db647d3a54a18 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:52:10 +0300 Subject: [PATCH 10/40] examples : fix examples/metal (#1920) Co-authored-by: Iwan Kawrakow --- examples/metal/metal.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/metal/metal.cpp b/examples/metal/metal.cpp index 77aca94a3..cdfe4bfe9 100644 --- a/examples/metal/metal.cpp +++ b/examples/metal/metal.cpp @@ -40,8 +40,10 @@ int main(int argc, char ** argv) { // this allocates all Metal resources and memory buffers auto * ctx_metal = ggml_metal_init(); - ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data)); - ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval)); + const size_t max_size_data = ggml_get_max_tensor_size(ctx_data); 
+ const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval); + ggml_metal_add_buffer(ctx_metal, "data", ggml_get_mem_buffer(ctx_data), ggml_get_mem_size(ctx_data), max_size_data); + ggml_metal_add_buffer(ctx_metal, "eval", ggml_get_mem_buffer(ctx_eval), ggml_get_mem_size(ctx_eval), max_size_eval); // main { From 8ab8ba62eb27cc340be2edf3418e051b1d967416 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sun, 18 Jun 2023 11:13:43 +0300 Subject: [PATCH 11/40] llama : prevent usage of k-quants when tensor size is not a multiple of 256 (#1921) * Fix examples/metal * k-quants: prevent usage when tensor size is not divisible by 256 --------- Co-authored-by: Iwan Kawrakow --- llama.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llama.cpp b/llama.cpp index c165d3239..dfbb85a68 100644 --- a/llama.cpp +++ b/llama.cpp @@ -19,6 +19,11 @@ #ifdef GGML_USE_METAL #include "ggml-metal.h" #endif +#ifdef GGML_USE_K_QUANTS +#ifndef QK_K +#define QK_K 256 +#endif +#endif #include #include @@ -2491,6 +2496,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS + if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K || + quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) { + int nx = tensor.ne.at(0); + int ny = tensor.ne.at(0); + if (nx % QK_K != 0 || ny % QK_K != 0) { + fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K); + fprintf(stderr, "This is required to be able to use k-quants for now!\n"); + fprintf(stderr, "========================================================================================\n\n"); + throw std::runtime_error("Unsupported tensor size encountered\n"); + } + } if (tensor.name == "output.weight") { new_type = GGML_TYPE_Q6_K; } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { From e1886cf4fe0d0f31661dda52a4a9f34bd9b9009a Mon Sep 17 00:00:00 2001 From: Mike Date: Sun, 18 Jun 2023 16:28:26 +0800 Subject: [PATCH 12/40] readme : update Android build instructions (#1922) Add steps for using termux on android devices to prevent common errors. --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7defb7584..e5b3f59b3 100644 --- a/README.md +++ b/README.md @@ -617,7 +617,12 @@ And after 4.45 hours, you will have the final perplexity. #### Building the Project using Android NDK You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/). 
-First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake: + +First, install the essential packages for termux: +``` +pkg install clang wget git cmake +``` +Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake: ``` $ mkdir build-android $ cd build-android From 8596af427722775f0df4a7c90b9af067ba90d4ef Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Sun, 18 Jun 2023 19:19:16 +0800 Subject: [PATCH 13/40] ggml : fix bug in ggml_compute_forward_add_q_f32 (#1918) --- ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 78c365354..037f0bc99 100644 --- a/ggml.c +++ b/ggml.c @@ -7918,7 +7918,7 @@ static void ggml_compute_forward_add_q_f32( void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)); float * src1_row = (float *)((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)); - void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb0)); + void * dst_row = (void *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3)); assert(ne00 % 32 == 0); From 0ede372a51fd8160688e01b587582666c14e94e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 18 Jun 2023 16:07:09 +0200 Subject: [PATCH 14/40] Fixed incorrectly applying RMS norm twice (#1925) --- llama.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llama.cpp b/llama.cpp index dfbb85a68..45360cea3 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1657,11 +1657,7 @@ static bool llama_eval_internal( { cur = ggml_rms_norm(ctx0, inpL); offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_inpL"); - - cur = ggml_rms_norm(ctx0, cur); - offload_func_nr(cur); - ggml_set_name(cur, "rms_norm_after"); + ggml_set_name(cur, "rms_norm_2"); // cur = cur*norm(broadcasted) cur = ggml_mul(ctx0, cur, model.norm); From b24c3049d96557c24782e4d32feaae65f47277af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 18 Jun 2023 17:41:26 +0200 Subject: [PATCH 15/40] Added tokens per second to info prints (#1928) --- llama.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index 45360cea3..2105e3279 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3467,9 +3467,12 @@ void llama_print_timings(struct llama_context * ctx) { fprintf(stderr, "\n"); fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); - fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample); - fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval); - fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval); + fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample); + fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval); + fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", + __func__, 1e-3 * ctx->t_eval_us, n_eval, 
1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval); fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); } From 16b9cd193965769089881bb8ec012fccca7b37b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 19 Jun 2023 10:23:56 +0200 Subject: [PATCH 16/40] Convert vector to f16 for dequantize mul mat vec (#1913) * Convert vector to f16 for dmmv * compile option * Added compilation option description to README * Changed cmake CUDA_ARCHITECTURES from "OFF" to "native" --- CMakeLists.txt | 10 ++- Makefile | 3 + README.md | 9 ++- ggml-cuda.cu | 202 ++++++++++++++++++++++++++++++++++--------------- llama.cpp | 2 +- 5 files changed, 158 insertions(+), 68 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 736771954..dc06365d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") +option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF) set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_METAL "llama: use Metal" OFF) @@ -238,6 +239,9 @@ if (LLAMA_CUBLAS) add_compile_definitions(GGML_USE_CUBLAS) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) + if (LLAMA_CUDA_DMMV_F16) + add_compile_definitions(GGML_CUDA_DMMV_F16) + endif() add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) if (LLAMA_STATIC) @@ -490,13 +494,13 @@ endif() if (GGML_SOURCES_CUDA) message(STATUS "GGML CUDA sources found, configuring CUDA architecture") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF) + set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native") set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES OFF) + set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native") set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF) + set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native") endif() diff --git a/Makefile b/Makefile index afd06e0a6..5dd676fad 100644 --- a/Makefile +++ b/Makefile @@ -169,6 +169,9 @@ ifdef LLAMA_CUDA_DMMV_Y else NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1 endif # LLAMA_CUDA_DMMV_Y +ifdef LLAMA_CUDA_DMMV_F16 + NVCCFLAGS += -DGGML_CUDA_DMMV_F16 +endif # LLAMA_CUDA_DMMV_F16 ifdef LLAMA_CUDA_KQUANTS_ITER NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) else diff --git a/README.md b/README.md index e5b3f59b3..2d05de333 100644 --- a/README.md +++ b/README.md @@ -337,7 +337,14 @@ Building the program with BLAS support may lead to some performance improvements cmake --build . --config Release ``` - The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. + The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. 
The following compilation options are also available to tweak performance: + + | Option | Legal values | Default | Description | + |-------------------------|------------------------|---------|-------------| + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. | - #### CLBlast diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 16488b9f9..9ebc57aff 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -50,7 +50,15 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); } while (0) #endif // CUDART_VERSION >= 11 -typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1); +#ifdef GGML_CUDA_DMMV_F16 +typedef half dfloat; // dequantize float +typedef half2 dfloat2; +#else +typedef float dfloat; // dequantize float +typedef float2 dfloat2; +#endif //GGML_CUDA_DMMV_F16 + +typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream); typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); @@ -234,82 +242,106 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol } } -static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){ +static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q4_0 * x = (const block_q4_0 *) vx; - const float d = x[ib].d; + const dfloat d = x[ib].d; - const uint8_t vui = x[ib].qs[iqs]; + const int vui = x[ib].qs[iqs]; - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; + v.x = vui & 0xF; + v.y = vui >> 4; - v0 = (vi0 - 8)*d; - v1 = (vi1 - 8)*d; +#ifdef GGML_CUDA_DMMV_F16 + v = __hsub2(v, {8.0f, 8.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 8.0f) * d; + v.y = (v.y - 8.0f) * d; +#endif // GGML_CUDA_DMMV_F16 } -static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){ +static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q4_1 * x = (const block_q4_1 *) vx; - const float d = x[ib].d; - const float m = x[ib].m; + const dfloat d = x[ib].d; + const dfloat m = x[ib].m; - const uint8_t vui = x[ib].qs[iqs]; + const int vui = x[ib].qs[iqs]; - const int8_t vi0 = vui & 0xF; - const int8_t vi1 = vui >> 4; + v.x = vui & 0xF; + v.y = vui >> 4; - v0 = vi0*d + m; - v1 = vi1*d + m; +#ifdef GGML_CUDA_DMMV_F16 + v = __hmul2(v, {d, d}); + v = 
__hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_DMMV_F16 } -static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){ +static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q5_0 * x = (const block_q5_0 *) vx; - const float d = x[ib].d; + const dfloat d = x[ib].d; uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16; - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16; + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); - v0 = x0*d; - v1 = x1*d; +#ifdef GGML_CUDA_DMMV_F16 + v = __hsub2(v, {16.0f, 16.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 16.0f) * d; + v.y = (v.y - 16.0f) * d; +#endif // GGML_CUDA_DMMV_F16 } -static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){ +static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q5_1 * x = (const block_q5_1 *) vx; - const float d = x[ib].d; - const float m = x[ib].m; + const dfloat d = x[ib].d; + const dfloat m = x[ib].m; uint32_t qh; memcpy(&qh, x[ib].qh, sizeof(qh)); - const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0); - const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1); + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); - v0 = x0*d + m; - v1 = x1*d + m; +#ifdef GGML_CUDA_DMMV_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_DMMV_F16 } -static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){ +static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ const block_q8_0 * x = (const block_q8_0 *) vx; - const float d = x[ib].d; + const dfloat d = x[ib].d; - const int8_t vi0 = x[ib].qs[iqs + 0]; - const int8_t vi1 = x[ib].qs[iqs + 1]; + v.x = x[ib].qs[iqs + 0]; + v.y = x[ib].qs[iqs + 1]; - v0 = vi0*d; - v1 = vi1*d; +#ifdef GGML_CUDA_DMMV_F16 + v = __hmul2(v, {d, d}); +#else + v.x *= d; + v.y *= d; +#endif // GGML_CUDA_DMMV_F16 } //================================== k-quants @@ -843,11 +875,12 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float } } -static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){ +static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ const half * x = (const half *) vx; - v0 = __half2float(x[ib + iqs + 0]); - v1 = __half2float(x[ib + iqs + 1]); + // automatic half -> float type cast if dfloat == float + v.x = x[ib + iqs + 0]; + v.y = x[ib + iqs + 1]; } template @@ -864,13 +897,15 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k) const int y_offset = qr == 1 ? 
1 : qk/2; // dequantize - float & v0 = y[iybs + iqs + 0]; - float & v1 = y[iybs + iqs + y_offset]; - dequantize_kernel(vx, ib, iqs, v0, v1); + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x; + y[iybs + iqs + y_offset] = v.y; } template -static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols, const int nrows) { +static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) { // qk = quantized weights per x block // qr = number of quantized weights per data value in x block const int row = blockIdx.y*blockDim.y + threadIdx.y; @@ -885,7 +920,12 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter const int y_offset = qr == 1 ? 1 : qk/2; - float tmp = 0.0f; // partial sum for thread in warp +// partial sum for each thread +#ifdef GGML_CUDA_DMMV_F16 + half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_CUDA_DMMV_F16 for (int i = 0; i < ncols; i += iter_stride) { const int col = i + vals_per_iter*tid; @@ -899,14 +939,21 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, // process 2 vals per j iter // dequantize - float v0, v1; - dequantize_kernel(vx, ib, iqs + j/qr, v0, v1); // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); // matrix multiplication - tmp += v0 * y[iybs + iqs + j/qr + 0]; - tmp += v1 * y[iybs + iqs + j/qr + y_offset]; // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_CUDA_DMMV_F16 + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); +#else + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; +#endif // GGML_CUDA_DMMV_F16 } } @@ -918,7 +965,11 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, } if (tid == 0) { +#ifdef GGML_CUDA_DMMV_F16 + dst[row] = tmp.x + tmp.y; +#else dst[row] = tmp; +#endif // GGML_CUDA_DMMV_F16 } } @@ -1213,7 +1264,7 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu dequantize_block_q6_K<<>>(vx, y); } -static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; const dim3 block_nums(1, block_num_y, 1); @@ -1222,7 +1273,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, f <<>>(vx, y, dst, ncols, nrows); } -static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; const dim3 block_nums(1, block_num_y, 1); @@ -1231,7 +1282,7 @@ static void 
dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f <<>>(vx, y, dst, ncols, nrows); } -static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; const dim3 block_nums(1, block_num_y, 1); @@ -1240,7 +1291,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, f <<>>(vx, y, dst, ncols, nrows); } -static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; const dim3 block_nums(1, block_num_y, 1); @@ -1249,7 +1300,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, f <<>>(vx, y, dst, ncols, nrows); } -static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; const dim3 block_nums(1, block_num_y, 1); @@ -1299,7 +1350,7 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c dequantize_block<1, 1, convert_f16><<>>(vx, y, k); } -static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y; const dim3 block_nums(1, block_num_y, 1); @@ -1714,21 +1765,40 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( const int64_t ne00 = src0->ne[0]; const int64_t nrows = i01_high - i01_low; +// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_DMMV_F16 + size_t ash; + dfloat * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash); + ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00, + ne00, 1, sizeof(float), 0, 0, + ne00, 1, sizeof(half), 0, 0, cudaStream_main); + } +#else + dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_CUDA_DMMV_F16 + switch (src0->type) { case GGML_TYPE_Q4_0: - dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); break; case 
GGML_TYPE_Q4_1: - dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q5_0: - dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q5_1: - dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q8_0: - dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_Q2_K: dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); @@ -1746,7 +1816,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); break; case GGML_TYPE_F16: - convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main); + convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main); break; default: GGML_ASSERT(false); @@ -1754,6 +1824,12 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( } CUDA_CHECK(cudaGetLastError()); +#ifdef GGML_CUDA_DMMV_F16 + if (src1_convert_f16) { + ggml_cuda_pool_free(src1_dfloat, ash); + } +#endif // GGML_CUDA_DMMV_F16 + (void) src1; (void) dst; (void) src0_ddf_i; diff --git a/llama.cpp b/llama.cpp index 2105e3279..5401db00e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1620,7 +1620,7 @@ static bool llama_eval_internal( model.layers[il].w1, cur); offload_func(cur); - ggml_set_name(cur, "result_w2"); + ggml_set_name(cur, "result_w1"); // SILU activation cur = ggml_silu(ctx0, cur); From 1e3abfcef073e73c2b31e8570cb06c5cb2fd1f55 Mon Sep 17 00:00:00 2001 From: Howard Su Date: Mon, 19 Jun 2023 23:10:37 +0800 Subject: [PATCH 17/40] cmake : fix build shared ggml when CUDA is enabled (#1929) Co-authored-by: Georgi Gerganov --- CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc06365d1..a598593b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -469,6 +469,7 @@ add_library(ggml_static STATIC $) if (BUILD_SHARED_LIBS) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) add_library(ggml_shared SHARED $) + target_link_libraries(ggml_shared PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) endif() add_library(llama @@ -500,6 +501,11 @@ if (GGML_SOURCES_CUDA) set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native") set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") + if (BUILD_SHARED_LIBS) + set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native") + set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") + endif() + set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native") endif() From b97ca431db35ec96a339a721acb1219c1dd78bed Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 19 Jun 2023 18:12:33 +0300 Subject: [PATCH 18/40] ggml : sync latest ggml repo (#1924) * ggml : sync latest ggml repo * ggml : remove unused comments * ggml : asserts --- ggml.c | 801 
++++++++++++++++++++++++++++++++++++++++++++++++++------- ggml.h | 144 ++++++++++- 2 files changed, 844 insertions(+), 101 deletions(-) diff --git a/ggml.c b/ggml.c index 037f0bc99..14e08f9d6 100644 --- a/ggml.c +++ b/ggml.c @@ -112,6 +112,7 @@ typedef void* thread_ret_t; /*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 +#define GGML_GELU_QUICK_FP16 #define GGML_SILU_FP16 #define GGML_SOFT_MAX_UNROLL 4 @@ -340,6 +341,9 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { // precomputed gelu table for f16 (128 KB) static ggml_fp16_t table_gelu_f16[1 << 16]; +// precomputed quick gelu table for f16 (128 KB) +static ggml_fp16_t table_gelu_quick_f16[1 << 16]; + // precomputed silu table for f16 (128 KB) static ggml_fp16_t table_silu_f16[1 << 16]; @@ -1677,14 +1681,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x) #define GGML_F32x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = vaddq_f32(x[2*i], x[2*i+1]); \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = vaddq_f32(x[4*i], x[4*i+2]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - x[8*i] = vaddq_f32(x[8*i], x[8*i+4]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f32(x[i], x[offset+i]); \ } \ res = GGML_F32x4_REDUCE_ONE(x[0]); \ } @@ -1715,14 +1722,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { #define GGML_F16x8_MUL vmulq_f16 #define GGML_F16x8_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F16_ARR/2; ++i) { \ - x[2*i] = vaddq_f16(x[2*i], x[2*i+1]); \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F16_ARR/4; ++i) { \ - x[4*i] = vaddq_f16(x[4*i], x[4*i+2]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F16_ARR/8; ++i) { \ - x[8*i] = vaddq_f16(x[8*i], x[8*i+4]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vaddq_f16(x[i], x[offset+i]); \ } \ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ @@ -1789,14 +1799,17 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) { #define GGML_F32x8_MUL _mm256_mul_ps #define GGML_F32x8_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]); \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm256_add_ps(x[i], x[offset+i]); \ } \ const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \ _mm256_extractf128_ps(x[0], 1)); \ @@ -1886,14 +1899,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { #define GGML_F32x4_MUL vec_mul #define GGML_F32x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = vec_add(x[2*i], 
x[2*i+1]); \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = vec_add(x[4*i], x[4*i+2]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - x[8*i] = vec_add(x[8*i], x[8*i+4]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = vec_add(x[i], x[offset+i]); \ } \ res = vec_extract(x[0], 0) + \ vec_extract(x[0], 1) + \ @@ -1949,14 +1965,17 @@ static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { #define GGML_F32x4_MUL wasm_f32x4_mul #define GGML_F32x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ } \ res = wasm_f32x4_extract_lane(x[0], 0) + \ wasm_f32x4_extract_lane(x[0], 1) + \ @@ -2011,14 +2030,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { #define GGML_F16x4_MUL wasm_f32x4_mul #define GGML_F16x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F16_ARR/2; ++i) { \ - x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \ + int offset = GGML_F16_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F16_ARR/4; ++i) { \ - x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F16_ARR/8; ++i) { \ - x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = wasm_f32x4_add(x[i], x[offset+i]); \ } \ res = wasm_f32x4_extract_lane(x[0], 0) + \ wasm_f32x4_extract_lane(x[0], 1) + \ @@ -2060,14 +2082,17 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { #define GGML_F32x4_MUL _mm_mul_ps #define GGML_F32x4_REDUCE(res, x) \ { \ - for (int i = 0; i < GGML_F32_ARR/2; ++i) { \ - x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]); \ + int offset = GGML_F32_ARR >> 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/4; ++i) { \ - x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ } \ - for (int i = 0; i < GGML_F32_ARR/8; ++i) { \ - x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]); \ + offset >>= 1; \ + for (int i = 0; i < offset; ++i) { \ + x[i] = _mm_add_ps(x[i], x[offset+i]); \ } \ const __m128 t0 = _mm_hadd_ps(x[0], x[0]); \ res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0)); \ @@ -3356,6 +3381,7 @@ inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
x[i] : 0.f; } static const float GELU_COEF_A = 0.044715f; +static const float GELU_QUICK_COEF = -1.702f; static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; inline static float ggml_gelu_f32(float x) { @@ -3386,6 +3412,34 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { } #endif +inline static float ggml_gelu_quick_f32(float x) { + return x*(1.0f/(1.0f+expf(GELU_QUICK_COEF*x))); +} + +//inline static void ggml_vec_gelu_quick_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { +// const uint16_t * i16 = (const uint16_t *) x; +// for (int i = 0; i < n; ++i) { +// y[i] = table_gelu_quick_f16[i16[i]]; +// } +//} + +#ifdef GGML_GELU_QUICK_FP16 +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { + uint16_t t; + for (int i = 0; i < n; ++i) { + ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]); + memcpy(&t, &fp16, sizeof(uint16_t)); + y[i] = GGML_FP16_TO_FP32(table_gelu_quick_f16[t]); + } +} +#else +inline static void ggml_vec_gelu_quick_f32(const int n, float * y, const float * x) { + for (int i = 0; i < n; ++i) { + y[i] = ggml_gelu_quick_f32(x[i]); + } +} +#endif + // Sigmoid Linear Unit (SiLU) function inline static float ggml_silu_f32(float x) { return x/(1.0f + expf(-x)); @@ -3616,6 +3670,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "STEP", "RELU", "GELU", + "GELU_QUICK", "SILU", "SILU_BACK", "NORM", @@ -3644,12 +3699,15 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ROPE_BACK", "ALIBI", "CLAMP", - "CONV_1D_1S", - "CONV_1D_2S", + "CONV_1D_S1_PH", + "CONV_1D_S2_PH", + "CONV_2D_SK_P0", "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", + "WIN_PART", + "WIN_UNPART", "MAP_UNARY", "MAP_BINARY", @@ -3658,7 +3716,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57"); +static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -3684,6 +3742,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "step(x)", "relu(x)", "gelu(x)", + "gelu_quick(x)", "silu(x)", "silu_back(x)", "norm(x)", @@ -3712,12 +3771,15 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "rope_back(x)", "alibi(x)", "clamp(x)", - "conv_1d_1s(x)", - "conv_1d_2s(x)", + "conv_1d_s1_ph(x)", + "conv_1d_s2_ph(x)", + "conv_2d_sk_p0(x)", "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", + "win_part(x)", + "win_unpart(x)", "f(x)", "f(x,y)", @@ -3726,7 +3788,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 57, "GGML_OP_COUNT != 57"); +static_assert(GGML_OP_COUNT == 61, "GGML_OP_COUNT != 61"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); @@ -4017,7 +4079,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { // initialize time system (required on Windows) ggml_time_init(); - // initialize GELU, SILU and EXP F32 tables + // initialize GELU, Quick GELU, SILU and EXP F32 tables { const uint64_t t_start = ggml_time_us(); UNUSED(t_start); @@ -4027,13 +4089,14 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { memcpy(&ii, &ui, sizeof(ii)); const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); + 
table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); - GGML_PRINT_DEBUG("%s: GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); + GGML_PRINT_DEBUG("%s: GELU, Quick GELU, SILU and EXP tables initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } // initialize g_state @@ -4665,9 +4728,10 @@ const char * ggml_get_name(const struct ggml_tensor * tensor) { return tensor->name; } -void ggml_set_name(struct ggml_tensor * tensor, const char * name) { +struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name) { strncpy(tensor->name, name, sizeof(tensor->name)); tensor->name[sizeof(tensor->name) - 1] = '\0'; + return tensor; } struct ggml_tensor * ggml_view_tensor( @@ -5446,6 +5510,40 @@ struct ggml_tensor * ggml_gelu_inplace( return ggml_gelu_impl(ctx, a, true); } +// ggml_gelu_quick + +struct ggml_tensor * ggml_gelu_quick_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_GELU_QUICK; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + + return result; +} + +struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_gelu_quick_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_gelu_quick_impl(ctx, a, true); +} + // ggml_silu struct ggml_tensor * ggml_silu_impl( @@ -6645,7 +6743,7 @@ struct ggml_tensor * ggml_clamp( ggml_scratch_save(ctx); - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2); ((float *) b->data)[0] = min; ((float *) b->data)[1] = max; @@ -6660,9 +6758,9 @@ struct ggml_tensor * ggml_clamp( return result; } -// ggml_conv_1d_1s +// ggml_conv_1d_s1_ph -struct ggml_tensor * ggml_conv_1d_1s( +struct ggml_tensor * ggml_conv_1d_s1_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { @@ -6679,7 +6777,7 @@ struct ggml_tensor * ggml_conv_1d_1s( const int64_t ne[4] = { b->ne[0], a->ne[2], 1, 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - result->op = GGML_OP_CONV_1D_1S; + result->op = GGML_OP_CONV_1D_S1_PH; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; @@ -6687,9 +6785,9 @@ struct ggml_tensor * ggml_conv_1d_1s( return result; } -// ggml_conv_1d_2s +// ggml_conv_1d_s2_ph -struct ggml_tensor * ggml_conv_1d_2s( +struct ggml_tensor * ggml_conv_1d_s2_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { @@ -6706,7 +6804,35 @@ struct ggml_tensor * ggml_conv_1d_2s( const int64_t ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - result->op = GGML_OP_CONV_1D_2S; + result->op = GGML_OP_CONV_1D_S2_PH; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = b; + + return result; +} + +// ggml_conv_2d_sk_p0 + +struct ggml_tensor * ggml_conv_2d_sk_p0( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + GGML_ASSERT(b->ne[3] == 1); + GGML_ASSERT(a->ne[2] == b->ne[2]); + GGML_ASSERT(b->ne[0] % a->ne[0] == 0); + GGML_ASSERT(b->ne[1] % a->ne[1] == 0); + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { b->ne[0]/a->ne[0], b->ne[1]/a->ne[1], a->ne[3], 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_2D_SK_P0; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src0 = a; result->src1 = b; @@ -6840,6 +6966,89 @@ struct ggml_tensor * ggml_flash_attn_back( return result; } +// ggml_win_part + +struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w) { + GGML_ASSERT(a->ne[3] == 1); + GGML_ASSERT(a->type == GGML_TYPE_F32); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + // padding + const int px = (w - a->ne[1]%w)%w; + const int py = (w - a->ne[2]%w)%w; + + const int npx = (px + a->ne[1])/w; + const int npy = (py + a->ne[2])/w; + const int np = npx*npy; + + const int64_t ne[4] = { a->ne[0], w, w, np, }; + + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); + + ((int32_t *) b->data)[0] = npx; + ((int32_t *) b->data)[1] = npy; + ((int32_t *) b->data)[2] = w; + + ggml_scratch_load(ctx); + + result->op = GGML_OP_WIN_PART; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + result->opt[0] = b; + + return result; +} + +// ggml_win_unpart + +struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w) { + GGML_ASSERT(a->type == GGML_TYPE_F32); + + bool is_node = false; + + if (a->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { a->ne[0], w0, h0, 1, }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); + + ggml_scratch_save(ctx); + + struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); + + ((int32_t *) b->data)[0] = w; + + ggml_scratch_load(ctx); + + result->op = GGML_OP_WIN_UNPART; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src0 = a; + result->src1 = NULL; + result->opt[0] = b; + + return result; +} // ggml_map_unary @@ -9479,8 +9688,65 @@ static void ggml_compute_forward_gelu( GGML_ASSERT(false); } break; } +} - //printf("XXXXXXXX gelu\n"); +// ggml_compute_forward_gelu_quick + +static void ggml_compute_forward_gelu_quick_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(dst)); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_gelu_quick_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + +#ifndef NDEBUG + for (int k = 0; k < nc; k++) { + const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + UNUSED(x); + assert(!isnan(x)); + assert(!isinf(x)); + } +#endif + } +} + +static void ggml_compute_forward_gelu_quick( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_gelu_quick_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } } // ggml_compute_forward_silu @@ -10878,7 +11144,7 @@ static void ggml_compute_forward_set_f32( const int im2 = (ne12 == 0 ? 0 : ne12-1); const int im3 = (ne13 == 0 ? 
0 : ne13-1); - GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 < ggml_nbytes(dst)); + GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= ggml_nbytes(dst)); GGML_ASSERT(nb10 == sizeof(float)); @@ -11599,8 +11865,9 @@ static void ggml_compute_forward_alibi_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst) { assert(params->ith == 0); - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(src1) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -11663,8 +11930,9 @@ static void ggml_compute_forward_alibi_f16( const struct ggml_tensor * src1, struct ggml_tensor * dst) { assert(params->ith == 0); - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(src1) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -11766,15 +12034,16 @@ static void ggml_compute_forward_clamp_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst) { assert(params->ith == 0); - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(src1) == 2); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int min = ((float *) src1->data)[0]; - const int max = ((float *) src1->data)[1]; + const float min = ((float *) src1->data)[0]; + const float max = ((float *) src1->data)[1]; const int ith = params->ith; const int nth = params->nth; @@ -12332,9 +12601,9 @@ static void ggml_compute_forward_rope_back( } } -// ggml_compute_forward_conv_1d_1s +// ggml_compute_forward_conv_1d_s1_ph -static void ggml_compute_forward_conv_1d_1s_f16_f32( +static void ggml_compute_forward_conv_1d_s1_ph_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -12454,7 +12723,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32( } } -static void ggml_compute_forward_conv_1d_1s_f32( +static void ggml_compute_forward_conv_1d_s1_ph_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -12574,7 +12843,7 @@ static void ggml_compute_forward_conv_1d_1s_f32( } } -static void ggml_compute_forward_conv_1d_1s( +static void ggml_compute_forward_conv_1d_s1_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -12582,11 +12851,11 @@ static void ggml_compute_forward_conv_1d_1s( switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s1_ph_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s1_ph_f32(params, src0, src1, dst); } break; default: { @@ -12595,9 +12864,9 @@ static void ggml_compute_forward_conv_1d_1s( } } -// ggml_compute_forward_conv_1d_2s +// ggml_compute_forward_conv_1d_s2_ph -static void ggml_compute_forward_conv_1d_2s_f16_f32( +static void ggml_compute_forward_conv_1d_s2_ph_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -12717,7 +12986,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32( } } -static void ggml_compute_forward_conv_1d_2s_f32( +static 
void ggml_compute_forward_conv_1d_s2_ph_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -12837,7 +13106,7 @@ static void ggml_compute_forward_conv_1d_2s_f32( } } -static void ggml_compute_forward_conv_1d_2s( +static void ggml_compute_forward_conv_1d_s2_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -12845,11 +13114,148 @@ static void ggml_compute_forward_conv_1d_2s( switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s2_ph_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst); + ggml_compute_forward_conv_1d_s2_ph_f32(params, src0, src1, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_conv_2d_sk_p0 + +static void ggml_compute_forward_conv_2d_sk_p0_f16_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + //const int ne03 = src0->ne[3]; + + const int ne10 = src1->ne[0]; + //const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + //const int ne13 = src1->ne[3]; + + const int ne0 = dst->ne[0]; + const int ne1 = dst->ne[1]; + const int ne2 = dst->ne[2]; + //const int ne3 = dst->ne[3]; + //const int ne = ne0*ne1*ne2*ne3; + + const int nb00 = src0->nb[0]; + //const int nb01 = src0->nb[1]; + //const int nb02 = src0->nb[2]; + const int nb03 = src0->nb[3]; + + const int nb10 = src1->nb[0]; + //const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + //const int nb13 = src1->nb[3]; + + //const int nb0 = dst->nb[0]; + //const int nb1 = dst->nb[1]; + const int nb2 = dst->nb[2]; + //const int nb3 = dst->nb[3]; + + const int ith = params->ith; + const int nth = params->nth; + + const int nk0 = ne00; + const int nk1 = ne01; + + // size of the convolution row - the kernel size unrolled across all channels + // round-up so it is more suitable for SIMD + const int ew0 = ggml_up32(nk0*nk1*ne02); + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + // TODO: fix this memset (wsize is overestimated) + memset(params->wdata, 0, params->wsize); + + // prepare source data (src1) + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + + for (int i12 = 0; i12 < ne12; i12++) { + const float * const src = (float *)((char *) src1->data + i12*nb12); + ggml_fp16_t * dst_data = wdata; + + for (int i1 = 0; i1 < ne1; i1++) { + for (int i0 = 0; i0 < ne0; i0++) { + for (int ik1 = 0; ik1 < nk1; ik1++) { + for (int ik0 = 0; ik0 < nk0; ik0++) { + dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = + GGML_FP32_TO_FP16(src[(i1*nk1 + ik1)*ne10 + (i0*nk0 + ik0)]); + } + } + } + } + } + } + + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // total patches in dst + const int np = ne2; + + // patches per thread + const int dp = (np + nth - 1)/nth; + + // patch range for this thread + const int ip0 = dp*ith; + const int ip1 = MIN(ip0 + dp, np); + + ggml_fp16_t * const wdata = 
(ggml_fp16_t *) params->wdata + 0; + + for (int i2 = ip0; i2 < ip1; i2++) { + float * dst_data = (float *)((char *) dst->data + i2*nb2); + + for (int i1 = 0; i1 < ne1; ++i1) { + for (int i0 = 0; i0 < ne0; ++i0) { + ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, + (ggml_fp16_t *) ((char *) src0->data + i2*nb03), + (ggml_fp16_t *) wdata + (i1*ne0 + i0)*ew0); + } + } + } +} + +static void ggml_compute_forward_conv_2d_sk_p0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_sk_p0_f16_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + //ggml_compute_forward_conv_2d_sk_p0_f32(params, src0, src1, dst); + GGML_ASSERT(false); } break; default: { @@ -13952,6 +14358,145 @@ static void ggml_compute_forward_flash_attn_back( } } +// ggml_compute_forward_win_part + +static void ggml_compute_forward_win_part_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne00 = src0->ne[0]; UNUSED(ne00); + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; UNUSED(ne03); + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; UNUSED(ne3); + + const int32_t nep0 = ((const int32_t *)(opt0->data))[0]; + const int32_t nep1 = ((const int32_t *)(opt0->data))[1]; + const int32_t w = ((const int32_t *)(opt0->data))[2]; + + assert(ne00 == ne0); + assert(ne3 == nep0*nep1); + + // TODO: optimize / multi-thread + for (int py = 0; py < nep1; ++py) { + for (int px = 0; px < nep0; ++px) { + const int64_t i3 = py*nep0 + px; + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i02 = py*w + i2; + const int64_t i01 = px*w + i1; + const int64_t i00 = i0; + + const int64_t i = i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0; + const int64_t j = i02*ne01*ne00 + i01*ne00 + i00; + + if (py*w + i2 >= ne02 || px*w + i1 >= ne01) { + ((float *) dst->data)[i] = 0.0f; + } else { + ((float *) dst->data)[i] = ((float *) src0->data)[j]; + } + } + } + } + } + } +} + +static void ggml_compute_forward_win_part( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_win_part_f32(params, src0, opt0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +// ggml_compute_forward_win_unpart + +static void ggml_compute_forward_win_unpart_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { + return; + } + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + //const int64_t ne03 = src0->ne[3]; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + + const int32_t w = ((const int32_t *)(opt0->data))[0]; + + // padding + const int px = (w - ne1%w)%w; + //const int py = (w - ne2%w)%w; + + const int 
npx = (px + ne1)/w; + //const int npy = (py + ne2)/w; + + assert(ne0 == ne00); + + // TODO: optimize / multi-thread + for (int64_t i2 = 0; i2 < ne2; ++i2) { + for (int64_t i1 = 0; i1 < ne1; ++i1) { + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int ip2 = i2/w; + const int ip1 = i1/w; + + const int64_t i02 = i2%w; + const int64_t i01 = i1%w; + const int64_t i00 = i0; + + const int64_t i = (ip2*npx + ip1)*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00 + i00; + const int64_t j = i2*ne1*ne0 + i1*ne0 + i0; + + ((float *) dst->data)[j] = ((float *) src0->data)[i]; + } + } + } +} + +static void ggml_compute_forward_win_unpart( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * opt0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_map_unary static void ggml_compute_forward_map_unary_f32( @@ -14424,6 +14969,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_gelu(params, tensor->src0, tensor); } break; + case GGML_OP_GELU_QUICK: + { + ggml_compute_forward_gelu_quick(params, tensor->src0, tensor); + } break; case GGML_OP_SILU: { ggml_compute_forward_silu(params, tensor->src0, tensor); @@ -14528,19 +15077,23 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_clamp(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_CONV_1D_1S: + case GGML_OP_CONV_1D_S1_PH: { - ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_conv_1d_s1_ph(params, tensor->src0, tensor->src1, tensor); } break; - case GGML_OP_CONV_1D_2S: + case GGML_OP_CONV_1D_S2_PH: { - ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor); + ggml_compute_forward_conv_1d_s2_ph(params, tensor->src0, tensor->src1, tensor); + } break; + case GGML_OP_CONV_2D_SK_P0: + { + ggml_compute_forward_conv_2d_sk_p0(params, tensor->src0, tensor->src1, tensor); } break; case GGML_OP_FLASH_ATTN: { - int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + const int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); GGML_ASSERT(t == 0 || t == 1); - bool masked = t != 0; + const bool masked = t != 0; ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); } break; case GGML_OP_FLASH_FF: @@ -14554,6 +15107,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm bool masked = t != 0; ggml_compute_forward_flash_attn_back(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], masked, tensor); } break; + case GGML_OP_WIN_PART: + { + ggml_compute_forward_win_part(params, tensor->src0, tensor->opt[0], tensor); + } break; + case GGML_OP_WIN_UNPART: + { + ggml_compute_forward_win_unpart(params, tensor->src0, tensor->opt[0], tensor); + } break; case GGML_OP_MAP_UNARY: { const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->opt[0]->data); @@ -14825,6 +15386,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_GELU_QUICK: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_ALIBI: { GGML_ASSERT(false); // TODO: not implemented @@ -15187,11 +15752,15 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor // 
noop } } break; - case GGML_OP_CONV_1D_1S: + case GGML_OP_CONV_1D_S1_PH: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_1D_2S: + case GGML_OP_CONV_1D_S2_PH: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D_SK_P0: { GGML_ASSERT(false); // TODO: not implemented } break; @@ -15360,6 +15929,8 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // not supported } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { @@ -15768,6 +16339,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_MUL: case GGML_OP_GELU: + case GGML_OP_GELU_QUICK: case GGML_OP_SILU: case GGML_OP_SILU_BACK: case GGML_OP_NORM: @@ -15874,8 +16446,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) { node->n_tasks = 1; //TODO } break; - case GGML_OP_CONV_1D_1S: - case GGML_OP_CONV_1D_2S: + case GGML_OP_CONV_1D_S1_PH: + case GGML_OP_CONV_1D_S2_PH: { node->n_tasks = n_threads; @@ -15902,6 +16474,41 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) GGML_ASSERT(false); } + work_size = MAX(work_size, cur); + } break; + case GGML_OP_CONV_2D_SK_P0: + { + node->n_tasks = n_threads; + + GGML_ASSERT(node->src1->ne[3] == 1); + + const int64_t ne00 = node->src0->ne[0]; // W + const int64_t ne01 = node->src0->ne[1]; // H + const int64_t ne02 = node->src0->ne[2]; // C + const int64_t ne03 = node->src0->ne[3]; // N + + const int64_t ne10 = node->src1->ne[0]; // W + const int64_t ne11 = node->src1->ne[1]; // H + const int64_t ne12 = node->src1->ne[2]; // C + + const int64_t nk = ne00*ne01; + + UNUSED(ne02); + UNUSED(ne03); + UNUSED(nk); + + size_t cur = 0; + + if (node->src0->type == GGML_TYPE_F16 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(ggml_fp16_t)*(ne10*ne11*ne12); + } else if (node->src0->type == GGML_TYPE_F32 && + node->src1->type == GGML_TYPE_F32) { + cur = sizeof(float)* (ne10*ne11*ne12); + } else { + GGML_ASSERT(false); + } + work_size = MAX(work_size, cur); } break; case GGML_OP_FLASH_ATTN: @@ -15963,6 +16570,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) work_size = MAX(work_size, cur); } break; + case GGML_OP_WIN_PART: + case GGML_OP_WIN_UNPART: case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { @@ -16495,16 +17104,20 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** if (!*ctx_data) { fprintf(stderr, "%s: failed to create ggml context\n", __func__); + fclose(fin); return result; } } data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize); - const size_t ret = fread(data->data, sizeof(char), fsize, fin); - if (ret != fsize) { - fprintf(stderr, "%s: failed to read %s\n", __func__, fname); - return result; + { + const size_t ret = fread(data->data, sizeof(char), fsize, fin); + if (ret != fsize) { + fprintf(stderr, "%s: failed to read %s\n", __func__, fname); + fclose(fin); + return result; + } } fclose(fin); diff --git a/ggml.h b/ggml.h index 1380c530f..18c78551f 100644 --- a/ggml.h +++ b/ggml.h @@ -303,6 +303,7 @@ extern "C" { GGML_OP_STEP, GGML_OP_RELU, GGML_OP_GELU, + GGML_OP_GELU_QUICK, GGML_OP_SILU, GGML_OP_SILU_BACK, GGML_OP_NORM, // normalize @@ -331,12 +332,15 @@ extern "C" { GGML_OP_ROPE_BACK, GGML_OP_ALIBI, GGML_OP_CLAMP, - GGML_OP_CONV_1D_1S, - GGML_OP_CONV_1D_2S, + GGML_OP_CONV_1D_S1_PH, + GGML_OP_CONV_1D_S2_PH, + GGML_OP_CONV_2D_SK_P0, 
GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, GGML_OP_FLASH_ATTN_BACK, + GGML_OP_WIN_PART, + GGML_OP_WIN_UNPART, GGML_OP_MAP_UNARY, GGML_OP_MAP_BINARY, @@ -557,8 +561,8 @@ extern "C" { GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); - GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor); - GGML_API void ggml_set_name(struct ggml_tensor * tensor, const char * name); + GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor); + GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name); // // operations on tensors with backpropagation @@ -611,24 +615,47 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_sub_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_mul( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_mul_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_div( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_div_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_sqr( struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sqr_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sqrt( struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sqrt_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_log( struct ggml_context * ctx, struct ggml_tensor * a); @@ -668,31 +695,67 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_abs_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sgn( struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_sgn_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_neg( struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_neg_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_step( struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_step_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_relu( struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_relu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + // TODO: double-check this computation is correct GGML_API struct ggml_tensor * ggml_gelu( struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_gelu_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_gelu_quick_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_silu( struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_silu_inplace( + struct ggml_context * ctx, + struct 
ggml_tensor * a); + // a - x // b - dy GGML_API struct ggml_tensor * ggml_silu_back( @@ -706,10 +769,18 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_rms_norm( struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_rms_norm_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + // a - x // b - dy GGML_API struct ggml_tensor * ggml_rms_norm_back( @@ -999,16 +1070,55 @@ extern "C" { float min, float max); - // padding = 1 + // TODO: implement general-purpose convolutions + // GGML_API struct ggml_tensor * ggml_conv_1d( + // struct ggml_context * ctx, + // struct ggml_tensor * a, + // struct ggml_tensor * b, + // int s0 + // int p0, + // int d0); + // + // GGML_API struct ggml_tensor * ggml_conv_2d( + // struct ggml_context * ctx, + // struct ggml_tensor * a, + // struct ggml_tensor * b, + // int s0, + // int s1, + // int p0, + // int p1, + // int d0, + // int d1); + + // padding = half // TODO: we don't support extra parameters for now // that's why we are hard-coding the stride, padding, and dilation // not great .. - GGML_API struct ggml_tensor * ggml_conv_1d_1s( + // example: + // a: 3 80 768 1 + // b: 3000 80 1 1 + // res: 3000 768 1 1 + // used in whisper + GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); - GGML_API struct ggml_tensor * ggml_conv_1d_2s( + // used in whisper + GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + + // kernel size is a->ne[0] x a->ne[1] + // stride is equal to kernel size + // padding is zero + // example: + // a: 16 16 3 768 + // b: 1024 1024 3 1 + // res: 64 64 768 1 + // used in sam + GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b); @@ -1036,6 +1146,26 @@ extern "C" { struct ggml_tensor * c0, struct ggml_tensor * c1); + // partition into non-overlapping windows with padding if needed + // example: + // a: 768 64 64 1 + // w: 14 + // res: 768 14 14 25 + // used in sam + GGML_API struct ggml_tensor * ggml_win_part( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w); + + // reverse of ggml_win_part + // used in sam + GGML_API struct ggml_tensor * ggml_win_unpart( + struct ggml_context * ctx, + struct ggml_tensor * a, + int w0, + int h0, + int w); + // Mapping operations typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); From ca7c3f4da5d144d4cd1dd44903552e6ba49b8ec8 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Mon, 19 Jun 2023 18:14:09 +0300 Subject: [PATCH 19/40] cuda : faster k-quants on older GPUs (#1930) * k_quants: hopefully much faster Q4_K on older GPUs On the GTX-1660 that I have available to represent "old GPUs", token prediction drops from 65.5 ms/tok to 41.5 ms/tok! * k_quants: hopefully much faster Q3_K on older GPUs On the GTX-1660 that I have available to represent "old GPUs", token prediction drops from 60.3 ms/tok to 41.0 ms/tok! * k_quants: faster Q2_K on older GPUs It looks like I didn't need to change anything compared to what we already had, so this is just adding clarifying comments. 
But I now measure 36.3 ms/tok on the GTX-1660, instead of the 47.2 ms/tok that I have written in the faster k-quants PR. * k_quants: faster Q5_K on older GPUs 68.5 ms/tok -> 62.0 ms/tok on GTX-1660. For some reason the same access pattern that leads to such resounding success for Q2_K to Q4_K did not work at all for Q5_K. It is also more difficult to measure because for Q5_K_S we only have 32 layers on the GTX-1660, so output, tok embeddings and kv cache are done on the CPU. --------- Co-authored-by: Iwan Kawrakow --- ggml-cuda.cu | 83 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9ebc57aff..36a251ecc 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -515,15 +515,15 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float const block_q2_K * x = (const block_q2_K *)vx + ib0; - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 - const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...7 + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...14 in steps of 4 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 const int q_offset = 32*im + l0; const int s_offset = 8*im; const int y_offset = 128*im + l0; @@ -578,27 +578,30 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols) { +static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { const uint16_t kmask1 = 0x0303; const uint16_t kmask2 = 0x0f0f; - const int row = blockIdx.x; + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; const int ib0 = row*num_blocks_per_row; const block_q3_K * x = (const block_q3_K *)vx + ib0; - const int tid = threadIdx.x/2; // 0...15 - const int ix = threadIdx.x%2; // 0, 1 + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 - const int n = 2; // iterations in the inner loop - const int im = tid/8; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - 8*im; // 0...7 + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... 
+ const int in = tid - step*im; // 0....15 or 0...7 const uint8_t m = 1 << (4*im); - const int l0 = n*in; // 0...28 in steps of 4 + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 const int q_offset = 32*im + l0; const int y_offset = 128*im + l0; @@ -609,7 +612,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float float tmp = 0; // partial sum for thread in warp - for (int i = ix; i < num_blocks_per_row; i += 2) { + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { const float * y = yy + i * QK_K + y_offset; const uint8_t * q = x[i].qs + q_offset; @@ -650,22 +653,25 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float } } -static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols) { +static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) { const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; const uint16_t kmask3 = 0xc0c0; - const int row = blockIdx.x; + const int row = blockIdx.y*blockDim.y + threadIdx.y; + if (row > nrows) return; const int num_blocks_per_row = ncols / QK_K; const int ib0 = row*num_blocks_per_row; - const int tid = threadIdx.x/2; // 0...15 - const int ix = threadIdx.x%2; + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 - const int n = 4; + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const int in = il%2; @@ -681,7 +687,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float float tmp = 0; // partial sum for thread in warp - for (int i = ix; i < num_blocks_per_row; i += 2) { + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { const uint8_t * q1 = x[i].qs + q_offset; const uint8_t * q2 = q1 + 64; @@ -736,7 +742,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float const int il = tid/4; // 0...3 const int ir = tid - 4*il;// 0...3 - const int n = 4; + const int n = 2; const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 const int in = il%2; @@ -775,11 +781,16 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float float4 sum = {0.f, 0.f, 0.f, 0.f}; float smin = 0; for (int l = 0; l < n; ++l) { - sum.x += y1[l+ 0] * ((ql1[l] & 0xF) + (qh[l] & (hm1 << 0) ? 16 : 0)); - sum.y += y1[l+32] * ((ql1[l] >> 4) + (qh[l] & (hm1 << 1) ? 16 : 0)); - sum.z += y2[l+ 0] * ((ql2[l] & 0xF) + (qh[l] & (hm2 << 0) ? 16 : 0)); - sum.w += y2[l+32] * ((ql2[l] >> 4) + (qh[l] & (hm2 << 1) ? 16 : 0)); - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) + + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 
16 : 0)); + sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) + + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; } tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; @@ -1311,7 +1322,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; + const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); @@ -1320,14 +1331,20 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const dim3 block_dims(32, 1, 1); - dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { GGML_ASSERT(ncols % QK_K == 0); - const dim3 block_dims(32, 1, 1); - dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(1, block_num_y, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols, nrows); } static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { From cb40dfca694b5cb849837548fd69932117c78362 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Mon, 19 Jun 2023 18:17:03 +0300 Subject: [PATCH 20/40] llama : only use Q6_K for output weights if tensor size is multiple of 256 (#1932) * Only use Q6_K for output weights if tensor size is multiple of 256 * Fixed copy/paste mistake --------- Co-authored-by: Iwan Kawrakow --- llama.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index 5401db00e..dad31cbcb 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2495,7 +2495,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K || quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) { int nx = tensor.ne.at(0); - int ny = tensor.ne.at(0); + int ny = tensor.ne.at(1); if (nx % QK_K != 0 || ny % QK_K != 0) { fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K); fprintf(stderr, "This is required to be able to use k-quants for now!\n"); @@ -2504,7 +2504,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } } if (tensor.name == "output.weight") { - new_type = GGML_TYPE_Q6_K; + int nx = tensor.ne.at(0); + int 
ny = tensor.ne.at(1); + if (nx % QK_K == 0 && ny % QK_K == 0) { + new_type = GGML_TYPE_Q6_K; + } } else if (tensor.name.find("attention.wv.weight") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; From 23fc5c219a9aebd57c8af3fac454062cc4622980 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 19 Jun 2023 18:18:34 +0300 Subject: [PATCH 21/40] cmake : fix trailing whitespaces --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a598593b6..2846d9b94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -505,7 +505,7 @@ if (GGML_SOURCES_CUDA) set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native") set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") endif() - + set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native") endif() From ba4e85a8339b9dd7cdffad31838235f2fe45a8ea Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Mon, 19 Jun 2023 23:20:06 +0800 Subject: [PATCH 22/40] llama : use aligned memory during ggml_init call from loading saved sessions (#1934) * fixed issue: memory is not guaranteed to be aligned properly during ggml_init call from loading saved sessions * - removed commented out old code from fix - updated another instance of same issue below original --- llama.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index dad31cbcb..4a7d01b32 100644 --- a/llama.cpp +++ b/llama.cpp @@ -3126,9 +3126,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { if (kv_size) { const size_t elt_size = ggml_element_size(kv_self.k); - char buffer[4096]; - - ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); + ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; gf.n_threads = 1; @@ -3234,9 +3232,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { const size_t elt_size = ggml_element_size(kv_self.k); - char buffer[4096]; - - ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true }); + ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true }); ggml_cgraph gf{}; gf.n_threads = 1; From 18b35625c3c19c64b7818a12460ba5ddb006dfdc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 19 Jun 2023 20:43:30 +0300 Subject: [PATCH 23/40] ggml : fix bug in LBFGS optimizer (found by ggml tests) --- ggml.c | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml.c b/ggml.c index 14e08f9d6..4319683f5 100644 --- a/ggml.c +++ b/ggml.c @@ -18237,7 +18237,6 @@ GGML_API void ggml_opt_init( ggml_set_zero(opt->lbfgs.g); ggml_set_zero(opt->lbfgs.gp); ggml_set_zero(opt->lbfgs.d); - ggml_set_zero(opt->lbfgs.pf); if (opt->lbfgs.pf) { ggml_set_zero(opt->lbfgs.pf); } From 20568fe60f00155fa25e92eb3a7f6b911d557967 Mon Sep 17 00:00:00 2001 From: Henri Vasserman Date: Tue, 20 Jun 2023 01:12:39 +0300 Subject: [PATCH 24/40] [Fix] Reenable server embedding endpoint (#1937) * Add back embedding feature * Update README --- examples/server/README.md | 13 +++++++++-- examples/server/server.cpp | 44 +++++++++++++++++++++++++++++++++++++- 2 files changed, 54 insertions(+), 3 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index 474a28b20..fa95c0044 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -21,6 +21,7 @@ 
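The `output.weight` guard in the quantization change above boils down to a divisibility test against the k-quant super-block size. A standalone sketch of that rule (the tensor shapes are illustrative, not taken from a specific model):

```cpp
// Illustration of the fallback rule added above: output.weight is promoted to Q6_K
// only when both dimensions are multiples of the k-quant super-block size QK_K.
#include <cstdio>

static const int QK_K = 256;

static const char * pick_output_type(int nx, int ny) {
    // Mirrors the new check: when the shape is not divisible, new_type is simply
    // left at the quantization type already chosen for the rest of the model.
    return (nx % QK_K == 0 && ny % QK_K == 0) ? "Q6_K" : "default quantized type";
}

int main() {
    printf("4096 x 32000 -> %s\n", pick_output_type(4096, 32000)); // both divisible by 256
    printf("3200 x 32000 -> %s\n", pick_output_type(3200, 32000)); // 3200 % 256 = 128
    return 0;
}
```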
Command line options: - `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. - `--port`: Set the port to listen. Default: `8080`. +- `--embedding`: Enable embedding extraction, Default: disabled. ## Build @@ -119,14 +120,14 @@ node . `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.9). - `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. (default: 128, -1 = infinity). + `n_predict`: Set the number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: 128, -1 = infinity). `n_keep`: Specify the number of tokens from the initial prompt to retain when the model resets its internal context. By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the initial prompt. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. - `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. + `prompt`: Provide a prompt. Internally, the prompt is compared, and it detects if a part has already been evaluated, and the remaining part will be evaluate. A space is inserted in the front like main.cpp does. `stop`: Specify a JSON array of stopping strings. These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). @@ -163,6 +164,14 @@ node . `content`: Set the text to tokenize. + Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`. + +- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does. + + *Options:* + + `content`: Set the text to process. + ## More examples ### Interactive mode diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 12d4e2fa4..c0984aadb 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -254,6 +254,11 @@ struct llama_server_context { n_past += n_eval; } + if (params.n_predict == 0) { + has_next_token = false; + return llama_token_eos(); + } + // out of user input, sample next token const float temp = params.temp; const int32_t top_k = params.top_k <= 0 ? 
llama_n_vocab(ctx) : params.top_k; @@ -419,6 +424,19 @@ struct llama_server_context { return token_text; } + + std::vector getEmbedding() { + static const int n_embd = llama_n_embd(ctx); + if (!params.embedding) { + LOG_WARNING("embedding disabled", { + { "params.embedding", params.embedding }, + }); + return std::vector(n_embd, 0.0f); + } + const float * data = llama_get_embeddings(ctx); + std::vector embedding(data, data + n_embd); + return embedding; + } }; static void server_print_usage(const char * argv0, const gpt_params & params, @@ -457,6 +475,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str()); fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port); fprintf(stderr, " -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); + fprintf(stderr, " --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); fprintf(stderr, "\n"); } @@ -603,6 +622,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, params.use_mlock = true; } else if (arg == "--no-mmap") { params.use_mmap = false; + } else if (arg == "--embedding") { + params.embedding = true; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); server_print_usage(argv[0], default_params, default_sparams); @@ -646,6 +667,12 @@ static json format_generation_settings(llama_server_context & llama) { }; } +static json format_embedding_response(llama_server_context & llama) { + return json { + { "embedding", llama.getEmbedding() }, + }; +} + static json format_final_response(llama_server_context & llama, const std::string & content) { return json { { "content", content }, @@ -881,12 +908,27 @@ int main(int argc, char ** argv) { svr.Post("/tokenize", [&llama](const Request & req, Response & res) { const json body = json::parse(req.body); - const std::string content = body["content"].get(); + const std::string content = body.value("content", ""); const std::vector tokens = llama_tokenize(llama.ctx, content, false); const json data = format_tokenizer_response(tokens); return res.set_content(data.dump(), "application/json"); }); + svr.Post("/embedding", [&llama](const Request & req, Response & res) { + const json body = json::parse(req.body); + + llama.rewind(); + llama_reset_timings(llama.ctx); + llama.params.prompt = body.value("content", ""); + llama.params.n_predict = 0; + llama.loadPrompt(); + llama.beginCompletion(); + llama.doCompletion(); + + const json data = format_embedding_response(llama); + return res.set_content(data.dump(), "application/json"); + }); + svr.set_logger(log_server_request); svr.set_exception_handler([](const Request &, Response & res, std::exception_ptr ep) { From aacdbd40562684665b6f7b8ba6695b7a2088bbb0 Mon Sep 17 00:00:00 2001 From: Ettore Di Giacinto Date: Tue, 20 Jun 2023 03:24:39 +0200 Subject: [PATCH 25/40] llama : fix params struct slignment (#1936) * Workaround struct misalignment during value-copy Signed-off-by: mudler * Move booleans at the bottom of the structure Signed-off-by: mudler * Add comment Signed-off-by: mudler --------- Signed-off-by: mudler --- llama.cpp | 6 +++--- llama.h | 17 ++++++++--------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4a7d01b32..e597f5048 100644 --- a/llama.cpp +++ b/llama.cpp @@ -925,21 +925,21 @@ static bool kv_cache_init( struct 
llama_context_params llama_context_default_params() { struct llama_context_params result = { + /*.seed =*/ -1, /*.n_ctx =*/ 512, /*.n_batch =*/ 512, /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ {0}, + /*.progress_callback =*/ nullptr, + /*.progress_callback_user_data =*/ nullptr, /*.low_vram =*/ false, - /*.seed =*/ -1, /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.vocab_only =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, /*.embedding =*/ false, - /*.progress_callback =*/ nullptr, - /*.progress_callback_user_data =*/ nullptr, }; return result; diff --git a/llama.h b/llama.h index 1241ba6c0..0de530d45 100644 --- a/llama.h +++ b/llama.h @@ -71,28 +71,27 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); - struct llama_context_params { + struct llama_context_params { + int seed; // RNG seed, -1 for random int n_ctx; // text context int n_batch; // prompt processing batch size int n_gpu_layers; // number of layers to store in VRAM int main_gpu; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs - bool low_vram; // if true, reduce VRAM usage at the cost of performance - int seed; // RNG seed, -1 for random + // called with a progress value between 0 and 1, pass NULL to disable + llama_progress_callback progress_callback; + // context pointer passed to the progress callback + void * progress_callback_user_data; + // Keep the booleans together to avoid misalignment during copy-by-value. + bool low_vram; // if true, reduce VRAM usage at the cost of performance bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible bool use_mlock; // force system to keep model in RAM bool embedding; // embedding mode only - - // called with a progress value between 0 and 1, pass NULL to disable - llama_progress_callback progress_callback; - // context pointer passed to the progress callback - void * progress_callback_user_data; }; - // model file types enum llama_ftype { LLAMA_FTYPE_ALL_F32 = 0, From 2322ec223a21625dfe9bd73ee677444a98a24ac9 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Tue, 20 Jun 2023 05:42:40 -0700 Subject: [PATCH 26/40] Fix typo (#1949) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d05de333..8136e7064 100644 --- a/README.md +++ b/README.md @@ -378,7 +378,7 @@ Building the program with BLAS support may lead to some performance improvements ```sh git clone https://github.com/CNugteren/CLBlast.git mkdir CLBlast/build - cd CLBLast/build + cd CLBlast/build cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF cmake --build . --config Release cmake --install . 
--prefix /some/path From 049aa16b8c5c6d086246e4e6b9feb18de4fbd663 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 20 Jun 2023 19:05:54 +0300 Subject: [PATCH 27/40] readme : add link to p1 --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 8136e7064..67012adab 100644 --- a/README.md +++ b/README.md @@ -9,12 +9,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++ **Hot topics:** +- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1 - Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729 -- GPU support with Metal (Apple Silicon): https://github.com/ggerganov/llama.cpp/pull/1642 -- High-quality 2,3,4,5,6-bit quantization: https://github.com/ggerganov/llama.cpp/pull/1684 -- Multi-GPU support: https://github.com/ggerganov/llama.cpp/pull/1607 -- Training LLaMA models from scratch: https://github.com/ggerganov/llama.cpp/pull/1652 -- CPU threading improvements: https://github.com/ggerganov/llama.cpp/pull/1632
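The `llama_context_params` reordering above is purely about object layout: moving `seed` next to the other integers and keeping the booleans together at the end means the struct no longer interleaves 1-byte members with larger ones. A toy illustration (these structs are made up, not the real `llama_context_params`) of how member order alone changes padding and size:

```cpp
// Toy structs (not the real llama_context_params) showing how member order
// affects padding: interleaving bools with 8-byte members inflates the struct.
#include <cstdio>

struct Interleaved {
    bool   low_vram;     // 1 byte + 7 bytes padding
    double big_a;        // stand-in for a pointer or other 8-byte field
    bool   use_mmap;     // 1 byte + 7 bytes padding
    double big_b;
};

struct Grouped {
    double big_a;
    double big_b;
    bool   low_vram;     // booleans kept together at the end
    bool   use_mmap;
};

int main() {
    printf("sizeof(Interleaved) = %zu\n", sizeof(Interleaved)); // typically 32
    printf("sizeof(Grouped)     = %zu\n", sizeof(Grouped));     // typically 24
    return 0;
}
```

The patch itself frames the change as a workaround for misalignment observed during copy-by-value; keeping the layout simple and the plain-old-data booleans contiguous is the conservative fix it settles on.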
Table of Contents From fb98254f99d769fcbbf20966ef386abdb48ef601 Mon Sep 17 00:00:00 2001 From: Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com> Date: Thu, 22 Jun 2023 03:18:43 +0530 Subject: [PATCH 28/40] Fix typo in README.md (#1961) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 67012adab..ace588606 100644 --- a/README.md +++ b/README.md @@ -340,7 +340,7 @@ Building the program with BLAS support may lead to some performance improvements | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value 2 1 can improve performance for slow GPUs. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast From bbca06e26949686d61a5126332680ba3cccf235c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Wed, 21 Jun 2023 23:49:25 +0200 Subject: [PATCH 29/40] cmake: revert CUDA arch default to 52, 61 if f16 (#1959) --- CMakeLists.txt | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2846d9b94..cc7560a7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -250,6 +250,15 @@ if (LLAMA_CUBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + if (LLAMA_CUDA_DMMV_F16) + set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics + else() + set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard + endif() + endif() + message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") + else() message(WARNING "cuBLAS not found") endif() @@ -493,22 +502,6 @@ if (BUILD_SHARED_LIBS) endif() endif() -if (GGML_SOURCES_CUDA) - message(STATUS "GGML CUDA sources found, configuring CUDA architecture") - set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - set_property(TARGET ggml_static PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_static PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - - if (BUILD_SHARED_LIBS) - set_property(TARGET ggml_shared PROPERTY CUDA_ARCHITECTURES "native") - set_property(TARGET ggml_shared PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto") - endif() - - set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES "native") -endif() - # # programs, examples and tests From 7487137227eb32ed9b12156338b865cb29b2dfd1 Mon Sep 17 00:00:00 2001 From: Erik Scholz Date: Thu, 22 Jun 2023 14:20:47 +0200 Subject: [PATCH 30/40] rework convert.py to 
read hyper-parameters from config.json (#1958) * Read hyper-parameters from HuggingFace-transformer config.json, if they exist, and fall back to guessing, like before otherwise. This allows converting open_llama 3B and other non-standard model designs. --- convert.py | 91 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 22 deletions(-) diff --git a/convert.py b/convert.py index 265c41fa0..de6c39c67 100644 --- a/convert.py +++ b/convert.py @@ -130,6 +130,14 @@ TENSORS_LIST = make_tensors_list() TENSORS_SET = set(TENSORS_LIST) +def find_n_mult(n_ff: int, n_embd: int) -> int: + # hardcoded magic range + for n_mult in range(256, 1, -1): + calc_ff = (((8*n_embd) // 3 + n_mult - 1) // n_mult)*n_mult + if calc_ff == n_ff: + return n_mult + return 1 + @dataclass class Params: n_vocab: int @@ -137,21 +145,61 @@ class Params: n_mult: int n_head: int n_layer: int - file_type: GGMLFileType @staticmethod - def guessed(model: 'LazyModel', file_type: GGMLFileType) -> 'Params': - n_vocab, n_embd = model["tok_embeddings.weight"].shape + def guessed(model: 'LazyModel') -> 'Params': + # try transformer naming first + n_vocab, n_embd = model["model.embed_tokens.weight"].shape if "model.embed_tokens.weight" in model else model["tok_embeddings.weight"].shape + + # try transformer naming first + if "model.layers.0.self_attn.q_proj.weight" in model: + n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model) + else: + n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) + + n_head=n_embd // 128 # guessed return Params( n_vocab=n_vocab, n_embd=n_embd, n_mult=256, - n_head=n_embd // 128, - n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model), - file_type=file_type, + n_head=n_head, + n_layer=n_layer, ) + @staticmethod + def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"]; + n_embd = config["hidden_size"]; + n_head = config["num_attention_heads"]; + n_layer = config["num_hidden_layers"]; + n_ff = config["intermediate_size"]; + + n_mult = find_n_mult(n_ff, n_embd); + + return Params( + n_vocab=n_vocab, + n_embd=n_embd, + n_mult=n_mult, + n_head=n_head, + n_layer=n_layer, + ) + + @staticmethod + def load(model_plus: 'ModelPlus') -> 'Params': + orig_config_path = model_plus.paths[0].parent / "params.json" + hf_transformer_config_path = model_plus.paths[0].parent / "config.json" + + if hf_transformer_config_path.exists(): + params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path) + else: + params = Params.guessed(model_plus.model) + + print(f'params: n_vocab:{params.n_vocab} n_embd:{params.n_embd} n_mult:{params.n_mult} n_head:{params.n_head} n_layer:{params.n_layer}') + return params + class SentencePieceVocab: def __init__(self, fname_tokenizer: Path, fname_added_tokens: Optional[Path]) -> None: @@ -595,18 +643,17 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int) -> LazyTensor: return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}) ' + lazy_tensor.description) -def convert_transformers_to_orig(model: LazyModel) -> LazyModel: +def convert_transformers_to_orig(model: LazyModel, params: Params) -> LazyModel: out: LazyModel = {} out["tok_embeddings.weight"] = model["model.embed_tokens.weight"] out["norm.weight"] = model["model.norm.weight"] out["output.weight"] = model["lm_head.weight"] - 
n_head = model["model.layers.0.self_attn.q_proj.weight"].shape[1] // 128 for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" not in model: break - out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], n_head) - out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], n_head) + out[f"layers.{i}.attention.wq.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head) + out[f"layers.{i}.attention.wk.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head) out[f"layers.{i}.attention.wv.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] out[f"layers.{i}.attention.wo.weight"] = model[f"model.layers.{i}.self_attn.o_proj.weight"] @@ -920,7 +967,7 @@ class OutputFile: def __init__(self, fname_out: Path) -> None: self.fout = open(fname_out, "wb") - def write_file_header(self, params: Params) -> None: + def write_file_header(self, params: Params, file_type: GGMLFileType) -> None: self.fout.write(b"ggjt"[::-1]) # magic values = [ 1, # file version @@ -930,7 +977,7 @@ class OutputFile: params.n_head, params.n_layer, params.n_embd // params.n_head, # rot (obsolete) - params.file_type.value, + file_type.value, ] self.fout.write(struct.pack("i" * len(values), *values)) @@ -958,10 +1005,10 @@ class OutputFile: of.fout.close() @staticmethod - def write_all(fname_out: Path, params: Params, model: LazyModel, vocab: Vocab) -> None: + def write_all(fname_out: Path, params: Params, file_type: GGMLFileType, model: LazyModel, vocab: Vocab) -> None: check_vocab_size(params, vocab) of = OutputFile(fname_out) - of.write_file_header(params) + of.write_file_header(params, file_type) print("Writing vocab...") of.write_vocab(vocab) @@ -997,11 +1044,11 @@ def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFi raise Exception(f"Unexpected combination of types: {name_to_type}") -def do_necessary_conversions(model: LazyModel) -> LazyModel: +def do_necessary_conversions(model: LazyModel, params: Params) -> LazyModel: model = handle_quantization(model) if "lm_head.weight" in model: - model = convert_transformers_to_orig(model) + model = convert_transformers_to_orig(model, params) model = filter_and_sort_tensors(model) return model @@ -1107,14 +1154,14 @@ def load_vocab(path: Path) -> SentencePieceVocab: return SentencePieceVocab(path, added_tokens_path if added_tokens_path.exists() else None) -def default_outfile(model_paths: List[Path], params: Params) -> Path: +def default_outfile(model_paths: List[Path], file_type: GGMLFileType) -> Path: namestr = { GGMLFileType.AllF32: "f32", GGMLFileType.MostlyF16: "f16", GGMLFileType.MostlyQ4_0: "q4_0", GGMLFileType.MostlyQ4_1: "q4_1", GGMLFileType.PerLayerIsQ4_1: "q4_1", - }[params.file_type] + }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.bin" if ret in model_paths: sys.stderr.write( @@ -1164,13 +1211,13 @@ def main(args_in: Optional[List[str]] = None) -> None: else: vocab_dir = args.vocab_dir if args.vocab_dir else model_plus.paths[0].parent vocab = load_vocab(vocab_dir) + params = Params.load(model_plus) model = model_plus.model - model = do_necessary_conversions(model) + model = do_necessary_conversions(model, params) output_type = pick_output_type(model, args.outtype) model = convert_to_output_type(model, output_type) - params = Params.guessed(model, output_type) - outfile = args.outfile or default_outfile(model_plus.paths, params) - 
OutputFile.write_all(outfile, params, model, vocab) + outfile = args.outfile or default_outfile(model_plus.paths, output_type) + OutputFile.write_all(outfile, params, output_type, model, vocab) print(f"Wrote {outfile}") From d7b7484f74d486f77feb4c0b7af7e1718ed91651 Mon Sep 17 00:00:00 2001 From: eiery <19350831+eiery@users.noreply.github.com> Date: Fri, 23 Jun 2023 04:38:01 -0400 Subject: [PATCH 31/40] Add OpenLLaMA instructions to the README (#1954) * add openllama to readme --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index ace588606..b09498be6 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
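The convert.py rework above replaces the old guess-everything path with values read from `config.json`, plus the small `find_n_mult()` search for the one hyper-parameter that is not stored there directly. A C++ restatement of that search (the shapes passed in `main()` are common LLaMA-style values used only as examples):

```cpp
// Minimal C++ restatement of the new find_n_mult() heuristic from convert.py
// (illustration only; the real implementation is the Python shown above).
#include <cstdio>

static int find_n_mult(int n_ff, int n_embd) {
    // Same hard-coded search range as convert.py: 256 down to 2.
    for (int n_mult = 256; n_mult > 1; --n_mult) {
        const int calc_ff = ((8 * n_embd / 3 + n_mult - 1) / n_mult) * n_mult;
        if (calc_ff == n_ff) {
            return n_mult;
        }
    }
    return 1;
}

int main() {
    // 7B-style shape: n_embd = 4096, n_ff = 11008 -> n_mult = 256
    // 3B-style shape: n_embd = 3200, n_ff = 8640  -> a smaller n_mult
    printf("n_mult(11008, 4096) = %d\n", find_n_mult(11008, 4096));
    printf("n_mult( 8640, 3200) = %d\n", find_n_mult(8640, 3200));
    return 0;
}
```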
   • Quantization
   • Interactive mode
   • Instruction mode with Alpaca
+  • Using OpenLLaMA
   • Using GPT4All
   • Using Pygmalion 7B & Metharme 7B
   • Obtaining the Facebook LLaMA original model and Stanford Alpaca model data
  • @@ -543,6 +544,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach. > ``` +### Using [OpenLLaMA](https://github.com/openlm-research/open_llama) + +OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights. + +- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face. +- Convert the model to ggml FP16 format using `python convert.py ` + ### Using [GPT4All](https://github.com/nomic-ai/gpt4all) - Obtain the `tokenizer.model` file from LLaMA model and put it to `models` From 527b6fba1d237befb324fd846bda7418c0fa394d Mon Sep 17 00:00:00 2001 From: Didzis Gosko Date: Sat, 24 Jun 2023 11:47:58 +0300 Subject: [PATCH 32/40] llama : make model stateless and context stateful (llama_state) (#1797) * llama : make model stateless and context stateful * llama : minor cleanup * llama : update internal API declaration * Apply suggestions from code review fix style Co-authored-by: Georgi Gerganov * Missing model memory release * Fix style * Add deprecated warning for public API function llama_init_from_file * Update public API use cases: move away from deprecated llama_init_from_file * Deprecate public API function llama_apply_lora_from_file --------- Co-authored-by: Georgi Gerganov --- examples/common.cpp | 24 ++- examples/common.h | 3 +- examples/embedding/embedding.cpp | 6 +- examples/main/main.cpp | 8 +- examples/perplexity/perplexity.cpp | 6 +- examples/quantize-stats/quantize-stats.cpp | 15 +- examples/save-load-state/save-load-state.cpp | 29 ++- examples/server/server.cpp | 9 +- examples/simple/simple.cpp | 8 +- .../train-text-from-scratch.cpp | 5 +- llama.cpp | 172 ++++++++++++------ llama.h | 35 +++- tests/test-tokenizer-0.cpp | 16 +- 13 files changed, 244 insertions(+), 92 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index fed24e027..6ac484555 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -536,7 +536,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s return res; } -struct llama_context * llama_init_from_gpt_params(const gpt_params & params) { +std::tuple llama_init_from_gpt_params(const gpt_params & params) { auto lparams = llama_context_default_params(); lparams.n_ctx = params.n_ctx; @@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) { lparams.logits_all = params.perplexity; lparams.embedding = params.embedding; - llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams); - - if (lctx == NULL) { + llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams); + if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); - return NULL; + return std::make_tuple(nullptr, nullptr); + } + + llama_context * lctx = llama_new_context_with_model(model, lparams); + if (lctx == NULL) { + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } if (!params.lora_adapter.empty()) { - int err = llama_apply_lora_from_file(lctx, + int err = llama_model_apply_lora_from_file(model, params.lora_adapter.c_str(), params.lora_base.empty() ? 
NULL : params.lora_base.c_str(), params.n_threads); if (err != 0) { fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__); - return NULL; + llama_free(lctx); + llama_free_model(model); + return std::make_tuple(nullptr, nullptr); } } - return lctx; + return std::make_tuple(model, lctx); } void console_init(console_state & con_st) { diff --git a/examples/common.h b/examples/common.h index 6c2953cb2..713320179 100644 --- a/examples/common.h +++ b/examples/common.h @@ -9,6 +9,7 @@ #include #include #include +#include #if !defined (_WIN32) #include @@ -95,7 +96,7 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s // Model utils // -struct llama_context * llama_init_from_gpt_params(const gpt_params & params); +std::tuple llama_init_from_gpt_params(const gpt_params & params); // // Console utils diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 860f99f67..369eac1d1 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -37,11 +37,12 @@ int main(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; // load the model - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } @@ -90,6 +91,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 941312f9c..c1e6bf126 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -107,12 +107,13 @@ int main(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; g_ctx = &ctx; // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } @@ -139,6 +140,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } @@ -147,6 +149,7 @@ int main(int argc, char ** argv) { if (params.export_cgraph) { llama_eval_export(ctx, "llama.ggml"); llama_free(ctx); + llama_free_model(model); return 0; } @@ -666,6 +669,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index ae8cfe0af..b59f5971e 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -149,11 +149,12 @@ int main(int argc, char ** argv) { llama_init_backend(); + llama_model * model; llama_context * ctx; // load the model and apply lora adapter, if any - ctx = llama_init_from_gpt_params(params); - if (ctx == NULL) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { fprintf(stderr, "%s: error: unable to load model\n", __func__); return 1; } @@ -169,6 +170,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); + llama_free_model(model); return 0; } diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 6b8018ee2..9cea472de 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -320,6 +320,7 @@ int main(int argc, 
char ** argv) { fprintf(stderr, "Loading model\n"); const int64_t t_main_start_us = ggml_time_us(); + llama_model * model; llama_context * ctx; { @@ -330,10 +331,18 @@ int main(int argc, char ** argv) { lparams.f16_kv = false; lparams.use_mlock = false; - ctx = llama_init_from_file(params.model.c_str(), lparams); + model = llama_load_model_from_file(params.model.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); + fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str()); + llama_free_model(model); return 1; } } @@ -357,6 +366,7 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: error: Quantization should be tested with a float model, " "this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type); llama_free(ctx); + llama_free_model(model); return 1; } included_layers++; @@ -415,6 +425,7 @@ int main(int argc, char ** argv) { llama_free(ctx); + llama_free_model(model); // report timing { const int64_t t_main_end_us = ggml_time_us(); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index da4d37ad0..4c8688503 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -35,12 +35,22 @@ int main(int argc, char ** argv) { auto last_n_tokens_data = std::vector(params.repeat_last_n, 0); // init - auto ctx = llama_init_from_file(params.model.c_str(), lparams); + auto model = llama_load_model_from_file(params.model.c_str(), lparams); + if (model == nullptr) { + return 1; + } + auto ctx = llama_new_context_with_model(model, lparams); + if (ctx == nullptr) { + llama_free_model(model); + return 1; + } auto tokens = std::vector(params.n_ctx); auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true); if (n_prompt_tokens < 1) { fprintf(stderr, "%s : failed to tokenize prompt\n", __func__); + llama_free(ctx); + llama_free_model(model); return 1; } @@ -84,6 +94,8 @@ int main(int argc, char ** argv) { printf("%s", next_token_str); if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) { fprintf(stderr, "\n%s : failed to evaluate\n", __func__); + llama_free(ctx); + llama_free_model(model); return 1; } n_past += 1; @@ -91,23 +103,27 @@ int main(int argc, char ** argv) { printf("\n\n"); - // free old model + // free old context llama_free(ctx); - // load new model - auto ctx2 = llama_init_from_file(params.model.c_str(), lparams); + // make new context + auto ctx2 = llama_new_context_with_model(model, lparams); // Load state (rng, logits, embedding and kv_cache) from file { FILE *fp_read = fopen("dump_state.bin", "rb"); if (state_size != llama_get_state_size(ctx2)) { fprintf(stderr, "\n%s : failed to validate state size\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } const size_t ret = fread(state_mem, 1, state_size, fp_read); if (ret != state_size) { fprintf(stderr, "\n%s : failed to read state\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } @@ -138,6 +154,8 @@ int main(int argc, char ** argv) { printf("%s", next_token_str); if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) { fprintf(stderr, "\n%s : 
failed to evaluate\n", __func__); + llama_free(ctx2); + llama_free_model(model); return 1; } n_past += 1; @@ -145,5 +163,8 @@ int main(int argc, char ** argv) { printf("\n\n"); + llama_free(ctx2); + llama_free_model(model); + return 0; } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c0984aadb..de22d3013 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -115,6 +115,7 @@ struct llama_server_context { std::vector embd; std::vector last_n_tokens; + llama_model * model = nullptr; llama_context * ctx = nullptr; gpt_params params; @@ -130,6 +131,10 @@ struct llama_server_context { llama_free(ctx); ctx = nullptr; } + if (model) { + llama_free_model(model); + model = nullptr; + } } void rewind() { @@ -150,8 +155,8 @@ struct llama_server_context { bool loadModel(const gpt_params & params_) { params = params_; - ctx = llama_init_from_gpt_params(params); - if (ctx == nullptr) { + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr) { LOG_ERROR("unable to load model", { { "model", params_.model } }); return false; } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 76f991cdc..fc45c9340 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -68,11 +68,12 @@ int main(int argc, char ** argv) llama_init_backend(); - llama_context * ctx ; + llama_model * model; + llama_context * ctx; - ctx = llama_init_from_gpt_params( params ); + std::tie(model, ctx) = llama_init_from_gpt_params( params ); - if ( ctx == NULL ) + if ( model == NULL ) { fprintf( stderr , "%s: error: unable to load model\n" , __func__ ); return 1; @@ -170,6 +171,7 @@ int main(int argc, char ** argv) } // wend of main loop llama_free( ctx ); + llama_free_model( model ); return 0; } diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7ec85951a..61c829e5c 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -3054,7 +3054,8 @@ int main(int argc, char ** argv) { struct llama_context_params llama_params = llama_context_default_params(); llama_params.vocab_only = true; - struct llama_context * lctx = llama_init_from_file(params.fn_vocab_model, llama_params); + struct llama_model * lmodel = llama_load_model_from_file(params.fn_vocab_model, llama_params); + struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params); struct llama_vocab vocab; { @@ -3395,6 +3396,8 @@ int main(int argc, char ** argv) { delete[] compute_addr; delete[] compute_buf_0; delete[] compute_buf_1; + llama_free(lctx); + llama_free_model(lmodel); ggml_free(model.ctx); return 0; diff --git a/llama.cpp b/llama.cpp index e597f5048..a528eef4a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -182,6 +182,19 @@ struct llama_kv_cache { } }; +struct llama_vocab { + using id = int32_t; + using token = std::string; + + struct token_score { + token tok; + float score; + }; + + std::unordered_map token_to_id; + std::vector id_to_token; +}; + struct llama_model { e_model type = MODEL_UNKNOWN; @@ -198,10 +211,6 @@ struct llama_model { // context struct ggml_context * ctx = NULL; - // key + value cache for the self attention - // TODO: move to llama_state - struct llama_kv_cache kv_self; - // the model memory buffer llama_ctx_buffer buf; @@ -215,6 +224,11 @@ struct llama_model { // for quantize-stats only std::vector> tensors_by_name; + int64_t t_load_us = 0; + int64_t 
t_start_us = 0; + + llama_vocab vocab; + ~llama_model() { if (ctx) { ggml_free(ctx); @@ -233,24 +247,11 @@ struct llama_model { } }; -struct llama_vocab { - using id = int32_t; - using token = std::string; - - struct token_score { - token tok; - float score; - }; - - std::unordered_map token_to_id; - std::vector id_to_token; -}; - struct llama_context { + llama_context(const llama_model & model, const llama_vocab & vocab) : model(model), vocab(vocab), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {} + std::mt19937 rng; - int64_t t_load_us = 0; - int64_t t_start_us = 0; bool has_evaluated_once = false; int64_t t_sample_us = 0; @@ -261,8 +262,16 @@ struct llama_context { int32_t n_eval = 0; // number of eval calls int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1) - llama_model model; - llama_vocab vocab; + const llama_model & model; + const llama_vocab & vocab; + + bool model_owner = false; + + int64_t t_load_us; + int64_t t_start_us; + + // key + value cache for the self attention + struct llama_kv_cache kv_self; size_t mem_per_token = 0; @@ -1033,7 +1042,8 @@ static const char *llama_model_type_name(e_model type) { static void llama_model_load_internal( const std::string & fname, - llama_context & lctx, + llama_model & model, + llama_vocab & vocab, int n_ctx, int n_batch, int n_gpu_layers, @@ -1047,12 +1057,11 @@ static void llama_model_load_internal( llama_progress_callback progress_callback, void * progress_callback_user_data) { - lctx.t_start_us = ggml_time_us(); + model.t_start_us = ggml_time_us(); std::unique_ptr ml(new llama_model_loader(fname, use_mmap, vocab_only)); - lctx.vocab = std::move(ml->file_loaders.at(0)->vocab); - auto & model = lctx.model; + vocab = std::move(ml->file_loaders.at(0)->vocab); model.hparams = ml->file_loaders.at(0)->hparams; model.n_gpu_layers = n_gpu_layers; llama_file_version file_version = ml->file_loaders.at(0)->file_version; @@ -1122,15 +1131,15 @@ static void llama_model_load_internal( // create the ggml context { - lctx.model.buf.resize(ctx_size); + model.buf.resize(ctx_size); if (use_mlock) { - lctx.model.mlock_buf.init(lctx.model.buf.addr); - lctx.model.mlock_buf.grow_to(lctx.model.buf.size); + model.mlock_buf.init(model.buf.addr); + model.mlock_buf.grow_to(model.buf.size); } struct ggml_init_params params = { - /*.mem_size =*/ lctx.model.buf.size, - /*.mem_buffer =*/ lctx.model.buf.addr, + /*.mem_size =*/ model.buf.size, + /*.mem_buffer =*/ model.buf.addr, /*.no_alloc =*/ ml->use_mmap, }; @@ -1311,7 +1320,7 @@ static void llama_model_load_internal( } #endif - ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL); + ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? 
&model.mlock_mmap : NULL); if (progress_callback) { progress_callback(1.0f, progress_callback_user_data); @@ -1321,12 +1330,13 @@ static void llama_model_load_internal( // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration - lctx.t_load_us = ggml_time_us() - lctx.t_start_us; + model.t_load_us = ggml_time_us() - model.t_start_us; } static bool llama_model_load( const std::string & fname, - llama_context & lctx, + llama_model & model, + llama_vocab & vocab, int n_ctx, int n_batch, int n_gpu_layers, @@ -1340,7 +1350,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1378,7 +1388,7 @@ static bool llama_eval_internal( const auto & model = lctx.model; const auto & hparams = model.hparams; - const auto & kv_self = model.kv_self; + const auto & kv_self = lctx.kv_self; LLAMA_ASSERT(!!kv_self.ctx); @@ -1726,7 +1736,7 @@ static bool llama_eval_internal( //memcpy(embd_w.data(), ggml_get_data(cur), sizeof(float)*n_vocab*N); // update kv token count - lctx.model.kv_self.n = n_past + N; + lctx.kv_self.n = n_past + N; // extract logits { @@ -2634,12 +2644,39 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // interface implementation // -struct llama_context * llama_init_from_file( +struct llama_model * llama_load_model_from_file( const char * path_model, struct llama_context_params params) { ggml_time_init(); - llama_context * ctx = new llama_context; + llama_model * model = new llama_model; + + ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; + + if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, + params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, + params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { + delete model; + fprintf(stderr, "%s: failed to load model\n", __func__); + return nullptr; + } + + return model; +} + +void llama_free_model(struct llama_model * model) { + delete model; +} + +struct llama_context * llama_new_context_with_model( + struct llama_model * model, + struct llama_context_params params) { + + if (!model) { + return nullptr; + } + + llama_context * ctx = new llama_context(*model, model->vocab); if (params.seed < 0) { params.seed = time(NULL); @@ -2667,24 +2704,16 @@ struct llama_context * llama_init_from_file( ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, - params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock, - params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { - fprintf(stderr, "%s: failed to load model\n", __func__); - llama_free(ctx); - return nullptr; - } - // reserve memory for context buffers if (!params.vocab_only) { - if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + if (!kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; } { - const size_t memory_size = ggml_nbytes(ctx->model.kv_self.k) + ggml_nbytes(ctx->model.kv_self.v); + const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v); fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } @@ -2736,8 +2765,8 @@ struct llama_context * llama_init_from_file( LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); - LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->model.kv_self.buf.addr, ctx->model.kv_self.buf.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0)); + LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0)); LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0)); @@ -2748,7 +2777,23 @@ struct llama_context * llama_init_from_file( return ctx; } +struct llama_context * llama_init_from_file( + const char * path_model, + struct llama_context_params params) { + + struct llama_model * model = llama_load_model_from_file(path_model, params); + if (!model) { + return nullptr; + } + struct llama_context * ctx = llama_new_context_with_model(model, params); + ctx->model_owner = true; + return ctx; +} + void llama_free(struct llama_context * ctx) { + if (ctx->model_owner) { + delete &ctx->model; + } delete ctx; } @@ -2765,11 +2810,9 @@ int llama_model_quantize( } } -int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { +int llama_apply_lora_from_file_internal(const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads) { fprintf(stderr, "%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora); - auto & model = ctx->model; - const int64_t t_start_lora_us = ggml_time_us(); auto fin = std::ifstream(path_lora, std::ios::binary); @@ -3012,7 +3055,16 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char * int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) { try { - return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads); + return llama_apply_lora_from_file_internal(ctx->model, path_lora, 
path_base_model, n_threads); + } catch (const std::exception & err) { + fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); + return 1; + } +} + +int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) { + try { + return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads); } catch (const std::exception & err) { fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.what()); return 1; @@ -3020,7 +3072,7 @@ int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lor } int llama_get_kv_cache_token_count(const struct llama_context * ctx) { - return ctx->model.kv_self.n; + return ctx->kv_self.n; } #define LLAMA_MAX_RNG_STATE (64*1024) @@ -3045,7 +3097,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ctx->model.kv_self.buf.size; + const size_t s_kv = ctx->kv_self.buf.size; const size_t s_total = ( + s_rng_size @@ -3111,7 +3163,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { // copy kv cache { - const auto & kv_self = ctx->model.kv_self; + const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; const int n_layer = hparams.n_layer; const int n_embd = hparams.n_embd; @@ -3215,7 +3267,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { // set kv cache { - const auto & kv_self = ctx->model.kv_self; + const auto & kv_self = ctx->kv_self; const auto & hparams = ctx->model.hparams; const int n_layer = hparams.n_layer; const int n_embd = hparams.n_embd; @@ -3259,7 +3311,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { ggml_free(cpy_ctx); } - ctx->model.kv_self.n = kv_ntok; + ctx->kv_self.n = kv_ntok; } const size_t nread = inp - src; @@ -3506,6 +3558,6 @@ const char * llama_print_system_info(void) { } // For internal test use -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { +const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx) { return ctx->model.tensors_by_name; } diff --git a/llama.h b/llama.h index 0de530d45..a833a7f4d 100644 --- a/llama.h +++ b/llama.h @@ -26,6 +26,14 @@ # define LLAMA_API #endif +#ifdef __GNUC__ +# define DEPRECATED(func, hint) func __attribute__((deprecated(hint))) +#elif defined(_MSC_VER) +# define DEPRECATED(func, hint) __declspec(deprecated(hint)) func +#else +# define DEPRECATED(func, hint) func +#endif + #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' @@ -53,6 +61,7 @@ extern "C" { // TODO: show sample usage // + struct llama_model; struct llama_context; typedef int llama_token; @@ -136,12 +145,23 @@ extern "C" { LLAMA_API int64_t llama_time_us(); + LLAMA_API struct llama_model * llama_load_model_from_file( + const char * path_model, + struct llama_context_params params); + + LLAMA_API void llama_free_model(struct llama_model * model); + + LLAMA_API struct llama_context * llama_new_context_with_model( + struct llama_model * model, + struct llama_context_params params); + // Various functions for loading a ggml llama model. // Allocate (almost) all memory needed for the model. 
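Taken together, the new entry points split the old single-call initialization into a model load plus a context creation, with `llama_init_from_file()` kept only as a deprecated wrapper. A minimal usage sketch of the new flow (error handling trimmed, `"model.bin"` is a placeholder path):

```cpp
// Sketch of the two-step initialization introduced by this patch.
#include "llama.h"
#include <cstdio>

int main() {
    llama_init_backend();

    llama_context_params params = llama_context_default_params();

    // The weights are loaded once...
    llama_model * model = llama_load_model_from_file("model.bin", params);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ...and contexts (each with its own KV cache) are created against the same weights.
    llama_context * ctx0 = llama_new_context_with_model(model, params);
    llama_context * ctx1 = llama_new_context_with_model(model, params);

    // ... evaluate / sample with ctx0 and ctx1 ...

    // Contexts are freed before the model that backs them.
    llama_free(ctx0);
    llama_free(ctx1);
    llama_free_model(model);

    return 0;
}
```

This mirrors the updated examples in the patch itself (for instance save-load-state, which now creates a second context from the same `llama_model` instead of reloading the file).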
// Return NULL on failure - LLAMA_API struct llama_context * llama_init_from_file( + LLAMA_API DEPRECATED(struct llama_context * llama_init_from_file( const char * path_model, - struct llama_context_params params); + struct llama_context_params params), + "please use llama_load_model_from_file combined with llama_new_context_with_model instead"); // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); @@ -158,8 +178,15 @@ extern "C" { // The model needs to be reloaded before applying a new adapter, otherwise the adapter // will be applied on top of the previous one // Returns 0 on success - LLAMA_API int llama_apply_lora_from_file( + LLAMA_API DEPRECATED(int llama_apply_lora_from_file( struct llama_context * ctx, + const char * path_lora, + const char * path_base_model, + int n_threads), + "please use llama_model_apply_lora_from_file instead"); + + LLAMA_API int llama_model_apply_lora_from_file( + const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads); @@ -310,7 +337,7 @@ extern "C" { #include struct ggml_tensor; -std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); +const std::vector>& llama_internal_get_tensor_map(struct llama_context * ctx); #endif diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index ab1538a0c..20abe7100 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -28,6 +28,7 @@ int main(int argc, char **argv) { fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + llama_model * model; llama_context * ctx; // load the vocab @@ -36,10 +37,18 @@ int main(int argc, char **argv) { lparams.vocab_only = true; - ctx = llama_init_from_file(fname.c_str(), lparams); + model = llama_load_model_from_file(fname.c_str(), lparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + ctx = llama_new_context_with_model(model, lparams); if (ctx == NULL) { fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); return 1; } } @@ -48,6 +57,8 @@ int main(int argc, char **argv) { if (n_vocab != 32000) { fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab); + llama_free_model(model); + llama_free(ctx); return 2; } @@ -77,10 +88,13 @@ int main(int argc, char **argv) { } fprintf(stderr, "\n"); + llama_free_model(model); + llama_free(ctx); return 3; } } + llama_free_model(model); llama_free(ctx); return 0; From b061ba9e2a7a2c335a200df8c11aed5e31e4ccbb Mon Sep 17 00:00:00 2001 From: Alex Renda Date: Sat, 24 Jun 2023 03:15:01 -0700 Subject: [PATCH 33/40] llama : fix top-p sampling to match the canonical definition (#1953) * Fix top-p sampling to match the standard definition (smallest set that has probability mass at least p, not largest set with probability mass less than p) * top-p: correct gt to gte * add test for correct top-p behavior --- llama.cpp | 7 ++++--- tests/test-sampling.cpp | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index a528eef4a..ac22a48f8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2015,9 +2015,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can for (size_t i = 0; i < candidates->size; ++i) { cum_sum += candidates->data[i].p; - // Check if the running sum is greater than p or if we have kept at least min_keep tokens - if (cum_sum > p && i >= min_keep) { - last_idx = i; + // Check if the 
running sum is at least p or if we have kept at least min_keep tokens + // we set the last index to i+1 to indicate that the current iterate should be included in the set + if (cum_sum >= p && i + 1 >= min_keep) { + last_idx = i + 1; break; } } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 5d693f7b5..64f9455d7 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -181,6 +181,7 @@ int main(void) { test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f); + test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f); test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1); test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f); From 235b610d650cbfed6dbd5d671f750d35fc18cd7d Mon Sep 17 00:00:00 2001 From: Alberto <57916483+albbus-stack@users.noreply.github.com> Date: Sat, 24 Jun 2023 12:32:13 +0200 Subject: [PATCH 34/40] readme : fixed termux instructions (#1973) --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b09498be6..10462c6b0 100644 --- a/README.md +++ b/README.md @@ -680,12 +680,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled ``` GGML_OPENCL_PLATFORM=0 GGML_OPENCL_DEVICE=0 -export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH -./main (...) +export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH ``` For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. +Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. + ### Docker #### Prerequisites From 11da1a85cd69af84b5861134738c7e9e20907470 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Jun 2023 13:38:18 +0300 Subject: [PATCH 35/40] readme : fix whitespaces --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 10462c6b0..6aa6ce319 100644 --- a/README.md +++ b/README.md @@ -685,7 +685,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. -Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. +Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script. ### Docker From f2c754e1c38936fdde74e4848ac468a696eb73c6 Mon Sep 17 00:00:00 2001 From: slaren Date: Sat, 24 Jun 2023 12:57:18 +0200 Subject: [PATCH 36/40] ggml : improve ggml_graph_dump_dot, add ggml_format_name (#1978) * Improve ggml_graph_dump_dot, add ggml_format_name * add more automatic names to view ops * fix name of copies --- ggml.c | 135 ++++++++++++++++++++++++++++++++++++++++----------------- ggml.h | 1 + 2 files changed, 97 insertions(+), 39 deletions(-) diff --git a/ggml.c b/ggml.c index 4319683f5..ef9e8585d 100644 --- a/ggml.c +++ b/ggml.c @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef GGML_USE_METAL #include @@ -4734,10 +4735,19 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam return tensor; } +struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) 
{ + va_list args; + va_start(args, fmt); + vsnprintf(tensor->name, sizeof(tensor->name), fmt, args); + va_end(args); + return tensor; +} + struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { struct ggml_tensor * result = ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data); + ggml_format_name(result, "%s (view)", src->name); result->nb[0] = src->nb[0]; result->nb[1] = src->nb[1]; @@ -5899,6 +5909,11 @@ struct ggml_tensor * ggml_cpy_impl( // make a view of the destination struct ggml_tensor * result = ggml_view_tensor(ctx, b); + if (strlen(b->name) > 0) { + ggml_format_name(result, "%s (copy of %s)", b->name, a->name); + } else { + ggml_format_name(result, "%s (copy)", a->name); + } result->op = GGML_OP_CPY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5935,6 +5950,7 @@ struct ggml_tensor * ggml_cont_impl( } struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + ggml_format_name(result, "%s (cont)", a->name); result->op = GGML_OP_CONT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -5978,6 +5994,7 @@ struct ggml_tensor * ggml_reshape( } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6002,6 +6019,7 @@ struct ggml_tensor * ggml_reshape_1d( const int64_t ne[1] = { ne0 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6027,6 +6045,7 @@ struct ggml_tensor * ggml_reshape_2d( const int64_t ne[2] = { ne0, ne1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6053,6 +6072,7 @@ struct ggml_tensor * ggml_reshape_3d( const int64_t ne[3] = { ne0, ne1, ne2 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; @@ -6081,6 +6101,7 @@ struct ggml_tensor * ggml_reshape_4d( const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, a->data); + ggml_format_name(result, "%s (reshaped)", a->name); result->op = GGML_OP_RESHAPE; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -6105,10 +6126,12 @@ struct ggml_tensor * ggml_view_1d( } struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6141,10 +6164,12 @@ struct ggml_tensor * ggml_view_2d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6183,10 +6208,12 @@ struct ggml_tensor * ggml_view_3d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6227,10 +6254,12 @@ struct ggml_tensor * ggml_view_4d( const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 }; struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); + ggml_format_name(result, "%s (view)", a->name); ggml_scratch_save(ctx); struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); + ggml_set_name(offs, "offset"); memcpy(offs->data, &offset, 2*sizeof(int32_t)); ggml_scratch_load(ctx); @@ -6276,6 +6305,7 @@ struct ggml_tensor * ggml_permute( } struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (permuted)", a->name); int ne[GGML_MAX_DIMS]; int nb[GGML_MAX_DIMS]; @@ -6335,6 +6365,7 @@ struct ggml_tensor * ggml_transpose( } struct ggml_tensor * result = ggml_view_tensor(ctx, a); + ggml_format_name(result, "%s (transposed)", a->name); result->ne[0] = a->ne[1]; result->ne[1] = a->ne[0]; @@ -16004,7 +16035,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES); if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "leaf_%d", cgraph->n_leafs); + ggml_format_name(node, "leaf_%d", cgraph->n_leafs); } cgraph->leafs[cgraph->n_leafs] = node; @@ -16013,7 +16044,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES); if (strlen(node->name) == 0) { - snprintf(node->name, sizeof(node->name), "node_%d", cgraph->n_nodes); + ggml_format_name(node, "node_%d", cgraph->n_nodes); } cgraph->nodes[cgraph->n_nodes] = node; @@ -17397,6 +17428,26 @@ static struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgr return NULL; } +static void ggml_graph_dump_dot_node_edge(FILE * fp, const struct ggml_cgraph * gb, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + struct ggml_tensor * gparent = ggml_graph_get_parent(gb, node); + struct ggml_tensor * gparent0 = ggml_graph_get_parent(gb, parent); + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"%s\"; ]\n", + gparent0 ? 
(void *) gparent0 : (void *) parent, + gparent0 ? "g" : "x", + gparent ? (void *) gparent : (void *) node, + gparent ? "g" : "x", + gparent ? "empty" : "vee", + gparent ? "dashed" : "solid", + label); +} + +static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, struct ggml_tensor * parent, const char * label) { + fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"%s\"; ]\n", + (void *) parent, "x", + (void *) node, "x", + label); +} + void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { char color[16]; @@ -17432,7 +17483,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph (void *) node, color); if (strlen(node->name) > 0) { - fprintf(fp, "%s |", node->name); + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); } if (node->n_dims == 2) { @@ -17441,7 +17494,6 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph fprintf(fp, "%d [%" PRId64 ", %" PRId64 ", %" PRId64 "] | %s", i, node->ne[0], node->ne[1], node->ne[2], GGML_OP_SYMBOL[node->op]); } - if (node->grad) { fprintf(fp, " | %s\"; ]\n", GGML_OP_SYMBOL[node->grad->op]); } else { @@ -17460,18 +17512,29 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph (void *) node, color); if (strlen(node->name) > 0) { - fprintf(fp, "%s | ", node->name); + fprintf(fp, "%s (%s)|", node->name, ggml_type_name(node->type)); + } else { + fprintf(fp, "(%s)|", ggml_type_name(node->type)); } - if (ggml_nelements(node) == 1) { - if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { - fprintf(fp, "%d", ggml_get_i32_1d(node, 0)); + + fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); + if (ggml_nelements(node) < 5) { + fprintf(fp, " | ("); + for (int j = 0; j < ggml_nelements(node); j++) { + if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { + fprintf(fp, "%d", ggml_get_i32_1d(node, j)); + } + else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) { + fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j)); + } + else { + fprintf(fp, "#"); + } + if (j < ggml_nelements(node) - 1) { + fprintf(fp, ", "); + } } - else { - fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0)); - } - } - else { - fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]); + fprintf(fp, ")"); } fprintf(fp, "\"; ]\n"); } @@ -17479,30 +17542,20 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph for (int i = 0; i < gb->n_nodes; i++) { struct ggml_tensor * node = gb->nodes[i]; - struct ggml_tensor * parent = ggml_graph_get_parent(gb, node); - if (node->src0) { - struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0); - - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"x\"; ]\n", - parent0 ? (void *) parent0 : (void *) node->src0, - parent0 ? "g" : "x", - parent ? (void *) parent : (void *) node, - parent ? "g" : "x", - parent ? "empty" : "vee", - parent ? "dashed" : "solid"); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src0, "x"); } if (node->src1) { - struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->src1, "y"); + } - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ arrowhead = %s; style = %s; label = \"y\"; ]\n", - parent1 ? 
(void *) parent1 : (void *) node->src1, - parent1 ? "g" : "x", - parent ? (void *) parent : (void *) node, - parent ? "g" : "x", - parent ? "empty" : "vee", - parent ? "dashed" : "solid"); + for (int j = 0; j < GGML_MAX_OPT; j++) { + if (node->opt[j]) { + char label[16]; + snprintf(label, sizeof(label), "opt %d", j); + ggml_graph_dump_dot_node_edge(fp, gb, node, node->opt[j], label); + } } } @@ -17510,15 +17563,19 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph struct ggml_tensor * node = gb->leafs[i]; if (node->src0) { - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"x\"; ]\n", - (void *) node->src0, "x", - (void *) node, "x"); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src0, "x"); } if (node->src1) { - fprintf(fp, " \"%p\":%s -> \"%p\":%s [ label = \"y\"; ]\n", - (void *) node->src1, "x", - (void *) node, "x"); + ggml_graph_dump_dot_leaf_edge(fp, node, node->src1, "y"); + } + + for (int j = 0; j < GGML_MAX_OPT; j++) { + if (node->opt[j]) { + char label[16]; + snprintf(label, sizeof(label), "opt %d", j); + ggml_graph_dump_dot_leaf_edge(fp, node, node->opt[j], label); + } } } diff --git a/ggml.h b/ggml.h index 18c78551f..4b6b72845 100644 --- a/ggml.h +++ b/ggml.h @@ -563,6 +563,7 @@ extern "C" { GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * name); + GGML_API struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...); // // operations on tensors with backpropagation From c943d823c14cef33092205ca3944de6fdf7abf99 Mon Sep 17 00:00:00 2001 From: AN Long Date: Sat, 24 Jun 2023 19:02:06 +0800 Subject: [PATCH 37/40] convert : fix invalid params in write_vocab_only (#1975) --- convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index de6c39c67..e340d2273 100644 --- a/convert.py +++ b/convert.py @@ -998,9 +998,9 @@ class OutputFile: def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of = OutputFile(fname_out) params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, - n_head=1, n_layer=0, file_type=GGMLFileType.AllF32) + n_head=1, n_layer=0) of = OutputFile(fname_out) - of.write_file_header(params) + of.write_file_header(params, file_type=GGMLFileType.AllF32) of.write_vocab(vocab) of.fout.close() From fdd18609113862dc6eb34dfc44a093d54c59ff1f Mon Sep 17 00:00:00 2001 From: Rowan Hart Date: Sat, 24 Jun 2023 04:07:08 -0700 Subject: [PATCH 38/40] flake : fix ggml-metal.metal path and run nixfmt (#1974) --- flake.nix | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/flake.nix b/flake.nix index bba3d71f7..cebb47b94 100644 --- a/flake.nix +++ b/flake.nix @@ -9,27 +9,33 @@ inherit (pkgs.stdenv) isAarch64 isDarwin; inherit (pkgs.lib) optionals; isM1 = isAarch64 && isDarwin; - osSpecific = - if isM1 then with pkgs.darwin.apple_sdk_11_0.frameworks; [ Accelerate MetalKit MetalPerformanceShaders MetalPerformanceShadersGraph ] - else if isDarwin then with pkgs.darwin.apple_sdk.frameworks; [ Accelerate CoreGraphics CoreVideo ] - else [ ]; - pkgs = import nixpkgs { - inherit system; - }; - llama-python = pkgs.python310.withPackages (ps: with ps; [ - numpy - sentencepiece - ]); - in - { + osSpecific = if isM1 then + with pkgs.darwin.apple_sdk_11_0.frameworks; [ + Accelerate + MetalKit + MetalPerformanceShaders + MetalPerformanceShadersGraph + ] + else if isDarwin then + with 
pkgs.darwin.apple_sdk.frameworks; [ + Accelerate + CoreGraphics + CoreVideo + ] + else + [ ]; + pkgs = import nixpkgs { inherit system; }; + llama-python = + pkgs.python310.withPackages (ps: with ps; [ numpy sentencepiece ]); + in { packages.default = pkgs.stdenv.mkDerivation { name = "llama.cpp"; src = ./.; - postPatch = - if isM1 then '' - substituteInPlace ./ggml-metal.m \ - --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/ggml-metal.metal\";" - '' else ""; + postPatch = if isM1 then '' + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + '' else + ""; nativeBuildInputs = with pkgs; [ cmake ]; buildInputs = osSpecific; cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" ] ++ (optionals isM1 [ @@ -62,11 +68,7 @@ }; apps.default = self.apps.${system}.llama; devShells.default = pkgs.mkShell { - packages = with pkgs; [ - cmake - llama-python - ] ++ osSpecific; + packages = with pkgs; [ cmake llama-python ] ++ osSpecific; }; - } - ); + }); } From 65bdd52a867539691007f85c5508146d507f72c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 24 Jun 2023 19:40:18 +0300 Subject: [PATCH 39/40] tests : sync test-grad0 from ggml --- tests/test-grad0.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test-grad0.c b/tests/test-grad0.c index c8c2c0f71..b5a499c1d 100644 --- a/tests/test-grad0.c +++ b/tests/test-grad0.c @@ -1,3 +1,4 @@ +#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows #include "ggml.h" #include @@ -5,6 +6,10 @@ #include #include +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + #define MAX_NARGS 3 #undef MIN @@ -197,8 +202,23 @@ bool check_gradient( float max_error_abs, float max_error_rel) { + static int n_threads = -1; + if (n_threads < 0) { + n_threads = GGML_DEFAULT_N_THREADS; + + const char *env = getenv("GGML_N_THREADS"); + if (env) { + n_threads = atoi(env); + } + + printf("GGML_N_THREADS = %d\n", n_threads); + } + struct ggml_cgraph gf = ggml_build_forward (f); + gf.n_threads = n_threads; + struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false); + gb.n_threads = n_threads; ggml_graph_compute(ctx0, &gf); ggml_graph_reset (&gf); From 5ec8dd5a3c6a9a109351d2257bb9d53869bd0a94 Mon Sep 17 00:00:00 2001 From: Robyn Date: Sun, 25 Jun 2023 04:10:29 +1000 Subject: [PATCH 40/40] #1869 Fix null reference errors when training from scratch with CUDA (#1907) * #1869 Fix null reference errors when training from scratch with CUDA build Calling ggml_compute_forward when node->src0 was null was causing train-text-from-scratch.exe to terminate unexpectedly. 
* ggml : do not dereference src0 if NULL --------- Co-authored-by: Georgi Gerganov --- ggml-cuda.cu | 2 +- ggml.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 36a251ecc..010682edb 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2635,7 +2635,7 @@ void ggml_cuda_free_scratch() { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU - || tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT + || (tensor->src0 != nullptr && (tensor->src0->backend == GGML_BACKEND_GPU || tensor->src0->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src1 != nullptr && tensor->src1->backend == GGML_BACKEND_GPU); switch (tensor->op) { diff --git a/ggml.c b/ggml.c index ef9e8585d..7104be01b 100644 --- a/ggml.c +++ b/ggml.c @@ -14911,7 +14911,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm if (skip_cpu) { return; } - GGML_ASSERT(tensor->src0->backend == GGML_BACKEND_CPU); + GGML_ASSERT(tensor->src0 == NULL || tensor->src0->backend == GGML_BACKEND_CPU); GGML_ASSERT(tensor->src1 == NULL || tensor->src1->backend == GGML_BACKEND_CPU); #endif // GGML_USE_CUBLAS
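
For context, the guard introduced above can be exercised in isolation. Below is a minimal, self-contained C sketch — stand-in type and enum names only, not part of the patch — showing why the NULL test on src0 must short-circuit before any member access, which is what the changes to ggml_cuda_compute_forward and ggml_compute_forward enforce.

    #include <stdbool.h>
    #include <stddef.h>

    /* Stand-in types: simplified mirrors of ggml_tensor / GGML_BACKEND_*,
     * used here only to illustrate the guarded check. */
    enum backend { BACKEND_CPU, BACKEND_GPU, BACKEND_GPU_SPLIT };

    struct tensor {
        enum backend    backend;
        struct tensor * src0;   /* may be NULL (e.g. leaf/parameter nodes) */
        struct tensor * src1;   /* may be NULL */
    };

    /* True if the node or any of its *present* sources lives on the GPU.
     * The NULL tests are evaluated first, so src0/src1 are never
     * dereferenced when absent -- the crash this patch fixes occurred
     * because the original condition read src0->backend unconditionally. */
    static bool any_on_device(const struct tensor * t) {
        return t->backend == BACKEND_GPU
            || (t->src0 != NULL && (t->src0->backend == BACKEND_GPU ||
                                    t->src0->backend == BACKEND_GPU_SPLIT))
            || (t->src1 != NULL &&  t->src1->backend == BACKEND_GPU);
    }

The same ordering applies to the assertion in ggml_compute_forward: checking `tensor->src0 == NULL ||` first lets the || short-circuit before the backend field is read.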