update

parent 333f704f74
commit b4680d7cd3

5 changed files with 20 additions and 14 deletions
@@ -22,6 +22,9 @@ set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
 set(CUDACXX /usr/local/cuda-12.2/bin/nvcc)
 #GGML_USE_CUBLAS
 
+#set(CMAKE_EXE_LINKER_FLAGS -pg)
+#set(CMAKE_SHARED_LINKER_FLAGS -pg)
+
 set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -447,8 +450,8 @@ if (LLAMA_ALL_WARNINGS)
     # todo : msvc
 endif()
 
 set(c_flags ${c_flags} -save-temps --verbose ${warning_flags})
 set(cxx_flags ${cxx_flags} -save-temps --verbose ${warning_flags})
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
                     "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
                     "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
@@ -506,7 +509,7 @@ if (NOT MSVC)
     add_link_options("-Wl,-Map=${TARGET}.map")
 
 if (LLAMA_GPROF)
     add_compile_options(-pg)
 endif()
 endif()
 
@@ -583,7 +583,7 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
 
 ### Memory/Disk Requirements
 
 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
 
 | Model | Original size | Quantized size (4-bit) |
 |------:|--------------:|-----------------------:|
@@ -687,7 +687,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
 
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
 
 For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
 
 ### Instruction mode with Alpaca
 
@@ -824,8 +824,8 @@ int main(int argc, char ** argv) {
     llama_print_timings(ctx);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
-    // dump core
-    int *ptr = 0; *ptr = 1;
+    // dont dump core
+    //int *ptr = 0; *ptr = 1;
 
     if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);
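The toggled pair above is a deliberate crash switch: writing through a null pointer raises SIGSEGV, so the process dumps core at a known point when core dumps are enabled (e.g. `ulimit -c unlimited`); this commit comments it back out. A minimal standalone sketch of the idiom (not part of the commit), with `abort()` as the well-defined alternative:

#include <stdlib.h>

// Force a core dump at a chosen point in the program.
static void dump_core_here(void) {
    // Idiom from the diff above: a null-pointer write raises SIGSEGV and,
    // with core dumps enabled, leaves a core file. It is undefined
    // behaviour, so a compiler may legally turn it into a different trap.
    //int *ptr = 0; *ptr = 1;

    // Well-defined alternative: SIGABRT also produces a core dump.
    abort();
}

int main(void) {
    dump_core_here();
    return 0;
}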
@@ -5,7 +5,7 @@
 #include <string.h>
 #include <assert.h>
 #include <float.h>
-
+#include <stdio.h>
 #ifdef __ARM_NEON
 
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
@@ -2419,6 +2419,7 @@ static inline __m128i get_scale_shuffle(int i) {
 #endif
 
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    //fprintf(stderr, "%s: n:%d s:%f vx:%p vy:%p\n", __func__, n,*s, vx, vy);
     const int qk = QK8_0;
     const int nb = n / qk;
 
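`ggml_vec_dot_q4_0_q8_0` walks the two vectors in quantization blocks: `qk` is the block size (`QK8_0` is 32 in upstream ggml) and `nb = n / qk` the block count, so `n` is assumed to be a multiple of the block size. A stripped-down sketch of that block walk, with a placeholder block type standing in for the real `block_q4_0`/`block_q8_0`:

#include <stdio.h>

#define QK8_0 32 // block size, as in upstream ggml

// Placeholder block: the real block_q4_0/block_q8_0 carry a scale plus
// packed quantized values; only the per-block loop shape matters here.
struct blk { float d; };

static void vec_dot_blocks(int n, float *s, const struct blk *x, const struct blk *y) {
    const int qk = QK8_0;
    const int nb = n / qk; // number of blocks; n must be a multiple of qk

    float sum = 0.0f;
    for (int i = 0; i < nb; i++) {
        // The real kernel dequantizes and multiplies qk packed values here;
        // this sketch just combines the per-block scales.
        sum += x[i].d * y[i].d;
    }
    *s = sum;
}

int main(void) {
    struct blk x[2] = {{1.0f}, {2.0f}};
    struct blk y[2] = {{3.0f}, {4.0f}};
    float s = 0.0f;
    vec_dot_blocks(2 * QK8_0, &s, x, y);
    printf("s = %f\n", s); // 1*3 + 2*4 = 11
    return 0;
}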
ggml.c (14 changes)
@@ -6365,7 +6365,7 @@ static void ggml_compute_forward_dup_f16(
                 GGML_ASSERT(false); // TODO: implement
             }
         } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
+            printf("%s: this is not optimal - fix me\n", __func__);
 
             if (dst->type == GGML_TYPE_F32) {
                 size_t id = 0;
@@ -6612,7 +6612,7 @@ static void ggml_compute_forward_dup_f32(
                 GGML_ASSERT(false); // TODO: implement
             }
         } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
+            printf("%s: this is not optimal - fix me\n", __func__);
 
             if (dst->type == GGML_TYPE_F32) {
                 size_t id = 0;
@@ -9390,6 +9390,7 @@ static void ggml_compute_forward_mul_mat(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9427,7 +9428,8 @@ static void ggml_compute_forward_mul_mat(
 
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
+    fprintf(stderr, "%s: params_type:%d src0:%p ->data %p src1:%p ->data %p\n", __func__, params->type, (void*)src0, src0->data, (void*)src1, src1->data);
 
 #if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
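The added `fprintf` traces which tensors reach the matmul: `__func__` expands to the enclosing function name, and `%p` prints the struct and data addresses, with the struct pointers cast to `void *` as `%p` expects. A self-contained sketch of the same pattern, using a hypothetical stand-in for `ggml_tensor`:

#include <stdio.h>

// Hypothetical stand-in for ggml_tensor; only the data pointer matters here.
struct tensor {
    void * data;
};

static void trace_args(const struct tensor * src0, const struct tensor * src1) {
    // %p expects void *, hence the casts on the struct pointers.
    fprintf(stderr, "%s: src0:%p ->data %p src1:%p ->data %p\n",
            __func__, (void *) src0, src0->data, (void *) src1, src1->data);
}

int main(void) {
    struct tensor a = {0};
    struct tensor b = {0};
    trace_args(&a, &b);
    return 0;
}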
@@ -9484,7 +9486,7 @@ static void ggml_compute_forward_mul_mat(
         }
     }
 
-    //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+    printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
     return;
 }
@@ -9518,7 +9520,7 @@ static void ggml_compute_forward_mul_mat(
     const int64_t nr0 = ne01; // src0 rows
     const int64_t nr1 = ne11*ne12*ne13; // src1 rows
 
-    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+    printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
     // distribute the thread work across the inner or outer loop based on which one is larger
 
@@ -9537,7 +9539,7 @@ static void ggml_compute_forward_mul_mat(
     const int64_t ir110 = dr1*ith1;
     const int64_t ir111 = MIN(ir110 + dr1, nr1);
 
-    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+    printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
 
     // threads with no work simply yield (not sure if it helps)
     if (ir010 >= ir011 || ir110 >= ir111) {
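The `ir110`/`ir111` values printed above are ggml's per-thread work split: each of the `nth1` threads on this dimension takes a share `dr1` of the `nr1` src1 rows as the half-open range `[ir110, ir111)`, clamped to `nr1`, so trailing threads can end up with an empty range and simply yield. A small sketch of that arithmetic (the round-up formula for `dr1`, the driver loop, and the sizes are assumptions, not shown in this diff):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const long long nr1  = 6; // total src1 rows (example value)
    const int       nth1 = 4; // threads splitting this dimension (example value)

    // Rounded-up share per thread so that all rows are covered.
    const long long dr1 = (nr1 + nth1 - 1) / nth1;

    for (int ith1 = 0; ith1 < nth1; ith1++) {
        const long long ir110 = dr1*ith1;              // first row for this thread
        const long long ir111 = MIN(ir110 + dr1, nr1); // one past the last row

        if (ir110 >= ir111) {
            // threads with no work simply yield
            printf("thread %d: no work, yields\n", ith1);
            continue;
        }
        printf("thread %d: rows [%lld, %lld)\n", ith1, ir110, ir111);
    }
    return 0;
}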