Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile

commit 84b28c4282 (6 changed files with 1099 additions and 1096 deletions)
CMakeLists.txt

@@ -48,7 +48,7 @@ set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA ke
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
+option(LLAMA_CUDA_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_K_QUANTS "llama: use k-quants" ON)
 
@@ -87,7 +87,7 @@ if (LLAMA_CUBLAS)
         add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
-        if (LLAMA_CUDA_DMMV_F16)
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
             add_compile_definitions(GGML_CUDA_F16)
         endif()
         add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
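Note: either option above ends up defining the single GGML_CUDA_F16 preprocessor symbol, so the rename stays backward compatible with existing -DLLAMA_CUDA_DMMV_F16=ON builds. A minimal C sketch of how code can key off that symbol (an illustration, not the actual ggml-cuda.cu source):

/* Hypothetical demo file; only the GGML_CUDA_F16 symbol comes from the
 * diff above. Compile with e.g. `cc -DGGML_CUDA_F16 f16_flag_demo.c`
 * to take the 16-bit path; without the define, the 32-bit default applies. */
#include <stdio.h>

#ifdef GGML_CUDA_F16
static const char * dmmv_type = "16-bit floats (half)";
#else
static const char * dmmv_type = "32-bit floats";
#endif

int main(void) {
    printf("dmmv CUDA kernels would use: %s\n", dmmv_type);
    return 0;
}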
@@ -103,7 +103,7 @@ if (LLAMA_CUBLAS)
         # 60 == f16 CUDA intrinsics
         # 61 == integer CUDA intrinsics
         # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (LLAMA_CUDA_DMMV_F16)
+        if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
             set(CMAKE_CUDA_ARCHITECTURES "37;52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
Makefile (8 changes)
@@ -42,7 +42,7 @@ endif
 
 # keep standard at C11 and C++11
 CFLAGS   = -I. -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c11 -fPIC -DGGML_USE_K_QUANTS
-CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -O3 -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
+CXXFLAGS = -I. -I./examples -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DGGML_USE_K_QUANTS
 LDFLAGS  =
 
 # these are used on windows, to build some libraries with extra old device compatibility
@@ -188,11 +188,11 @@ ifdef LLAMA_CUDA_CCBIN
 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml_v2-cuda.o: otherarch/ggml_v2-cuda.cu otherarch/ggml_v2-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 ggml_v2-cuda-legacy.o: otherarch/ggml_v2-cuda-legacy.cu otherarch/ggml_v2-cuda-legacy.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) $(CUBLAS_FLAGS) $(CUBLAS_CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
 
 ifdef LLAMA_METAL
(File diff suppressed because it is too large.)
examples/server/public/index.html

@@ -3,12 +3,11 @@
 <head>
     <meta charset="UTF-8">
     <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+    <meta name="color-scheme" content="light dark">
     <title>llama.cpp - chat</title>
 
     <style>
         body {
-            background-color: #fff;
-            color: #000;
             font-family: system-ui;
             font-size: 90%;
         }
ggml-metal.m (33 changes)
@@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
                         // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
 
                         GGML_ASSERT(ne00 == ne10);
-                        GGML_ASSERT(ne02 == ne12);
+                        // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+                        GGML_ASSERT(ne03 == ne13);
 
                         if (ggml_is_contiguous(src0) &&
                             ggml_is_contiguous(src1) &&
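Note: the relaxed assertion lets src1 carry more batch slices than src0 along dim 2, while dim 3 must still match exactly. A minimal C sketch of the implied shape rule, using ggml's ne-array convention (hypothetical helper, not the actual ggml-metal.m code):

#include <assert.h>
#include <stdint.h>

/* Hypothetical shape check: dim 0 is the shared inner dimension,
 * dim 2 may broadcast (ne12 an integer multiple of ne02), and
 * dim 3 still has to match exactly. */
static void check_mul_mat_shapes(const int64_t ne_src0[4], const int64_t ne_src1[4]) {
    assert(ne_src0[0] == ne_src1[0]);     /* ne00 == ne10 */
    assert(ne_src1[2] % ne_src0[2] == 0); /* ne12 is a multiple of ne02 */
    assert(ne_src0[3] == ne_src1[3]);     /* ne03 == ne13 */
}

int main(void) {
    const int64_t a[4] = {64, 32, 2, 1};  /* src0: 2 batch slices */
    const int64_t b[4] = {64, 16, 8, 1};  /* src1: 8 batch slices */
    check_mul_mat_shapes(a, b);           /* passes: 8 is a multiple of 2 */
    return 0;
}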
@@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
                                 initWithDevice:ctx->device transposeLeft:false transposeRight:true
                                 resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
 
-                        // we need to do ne02 multiplications
+                        // we need to do ne12 multiplications
                         // TODO: is there a way to do this in parallel - currently very slow ..
                         // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-                        for (int64_t i02 = 0; i02 < ne02; ++i02) {
-                            size_t offs_src0_cur = offs_src0 + i02*nb02;
+                        for (int64_t i02 = 0; i02 < ne12; ++i02) {
+                            size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
                             size_t offs_src1_cur = offs_src1 + i02*nb12;
                             size_t offs_dst_cur  = offs_dst  + i02*nb2;
 
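Note: the loop now runs once per src1 slice (ne12) and maps each iteration back to a src0 slice by integer division, so one src0 matrix serves ne12/ne02 consecutive src1 slices, the layout grouped-query attention uses. A standalone C sketch of that index mapping with hypothetical sizes:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    const int64_t ne02 = 2; /* hypothetical: src0 batch slices */
    const int64_t ne12 = 8; /* hypothetical: src1 batch slices */

    /* Same mapping as offs_src0 + i02/(ne12/ne02)*nb02 above:
     * each src0 slice is reused ne12/ne02 = 4 times. */
    for (int64_t i02 = 0; i02 < ne12; ++i02) {
        printf("dst slice %lld <- src0 slice %lld x src1 slice %lld\n",
               (long long) i02,
               (long long) (i02 / (ne12 / ne02)),
               (long long) i02);
    }
    return 0;
}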
@@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
                     switch (src0t) {
                         case GGML_TYPE_F16:
                             {
-                                GGML_ASSERT(ne02 == ne12);
-
                                 nth0 = 64;
                                 nth1 = 1;
                                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
                     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
                     [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
                     [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                    [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-                    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-                    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-                    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-                    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-                    [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-                    [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-                    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-                    [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:13];
-                    [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:14];
+                    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                    [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+                    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+                    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+                    [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+                    [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+                    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+                    [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
+                    [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
 
                     if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
                         src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
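Note: inserting ne02 at index 5 and ne12 at index 11 shifts every later atIndex by one, and these numbers must line up exactly with the kernel's parameter order in ggml-metal.metal (shown below). One way to keep both sides in sync would be a shared constant table; the C enum here is a hypothetical suggestion, not code from this commit:

/* Hypothetical argument-index table for kernel_mul_mat_f16_f32 after
 * this change; sharing it between the host encoder calls and the shader
 * would prevent setBytes:atIndex: and the kernel signature from drifting. */
enum mul_mat_f16_f32_arg {
    ARG_SRC0 = 0,  ARG_SRC1 = 1,  ARG_DST  = 2,
    ARG_NE00 = 3,  ARG_NE01 = 4,  ARG_NE02 = 5,
    ARG_NB00 = 6,  ARG_NB01 = 7,  ARG_NB02 = 8,
    ARG_NE10 = 9,  ARG_NE11 = 10, ARG_NE12 = 11,
    ARG_NB10 = 12, ARG_NB11 = 13, ARG_NB12 = 14,
    ARG_NE0  = 15, ARG_NE1  = 16
};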
ggml-metal.metal
@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
+        constant   int64_t & ne02,
         constant  uint64_t & nb00,
         constant  uint64_t & nb01,
         constant  uint64_t & nb02,
         constant   int64_t & ne10,
         constant   int64_t & ne11,
+        constant   int64_t & ne12,
         constant  uint64_t & nb10,
         constant  uint64_t & nb11,
         constant  uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
     const int64_t r1 = tgpig.y;
     const int64_t im = tgpig.z;
 
-    device const half  * x = (device const half  *) (src0 + r0*nb01 + im*nb02);
+    device const half  * x = (device const half  *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
 
     sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }
 
+
 kernel void kernel_alibi_f32(
         device const float * src0,
         device       float * dst,