mike dupont 2023-11-17 14:57:52 -05:00
parent 333f704f74
commit b4680d7cd3
5 changed files with 20 additions and 14 deletions

View file

@@ -22,6 +22,9 @@ set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
set(CUDACXX /usr/local/cuda-12.2/bin/nvcc)
#GGML_USE_CUBLAS
#set(CMAKE_EXE_LINKER_FLAGS -pg)
#set(CMAKE_SHARED_LINKER_FLAGS -pg)
set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -447,8 +450,8 @@ if (LLAMA_ALL_WARNINGS)
# todo : msvc
endif()
set(c_flags ${c_flags} -save-temps --verbose ${warning_flags})
set(cxx_flags ${cxx_flags} -save-temps --verbose ${warning_flags})
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
"$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
@@ -506,7 +509,7 @@ if (NOT MSVC)
add_link_options("-Wl,-Map=${TARGET}.map")
if (LLAMA_GPROF)
add_compile_options(-pg)
endif()
endif()

View file

@@ -583,7 +583,7 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
### Memory/Disk Requirements
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| Model | Original size | Quantized size (4-bit) |
|------:|--------------:|-----------------------:|
@@ -687,7 +687,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community; please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Instruction mode with Alpaca

View file

@@ -824,8 +824,8 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
// dump core
int *ptr = 0; *ptr = 1;
// don't dump core
//int *ptr = 0; *ptr = 1;
if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
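The change above disables a deliberate null-pointer write that had been used to force a core dump right after the timings are printed. As a minimal sketch of the same idiom (the helper name is hypothetical, not part of llama.cpp; `abort()` is the portable variant of the null write):

```c
#include <stdlib.h>

// Hypothetical helper illustrating the disabled debug trick above.
// Writing through a null pointer raises SIGSEGV; abort() raises SIGABRT.
// Either produces a core file if the core size limit allows it
// (e.g. `ulimit -c unlimited` in the launching shell).
static void dump_core_now(void) {
    abort(); // portable replacement for: int *ptr = 0; *ptr = 1;
}
```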

View file

@@ -5,7 +5,7 @@
#include <string.h>
#include <assert.h>
#include <float.h>
#include <stdio.h>
#ifdef __ARM_NEON
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
@@ -2419,6 +2419,7 @@ static inline __m128i get_scale_shuffle(int i) {
#endif
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
//fprintf(stderr, "%s: n:%d s:%f vx:%p vy:%p\n", __func__, n,*s, vx, vy);
const int qk = QK8_0;
const int nb = n / qk;
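For orientation, the function being instrumented here computes a dot product over blocks of 32 weights: 4-bit quants with a per-block scale against 8-bit quants with a per-block scale. A minimal scalar sketch of that layout and accumulation (scales stored as plain float for simplicity; the real blocks store them as fp16 and the real code adds SIMD paths):

```c
#include <stdint.h>

#define QK4_0 32
#define QK8_0 32

typedef struct { float d; uint8_t qs[QK4_0/2]; } block_q4_0; // scale + 32 x 4-bit quants
typedef struct { float d; int8_t  qs[QK8_0];   } block_q8_0; // scale + 32 x 8-bit quants

// Scalar reference: each byte of x holds two 4-bit weights stored with a
// +8 bias, so subtract 8 to recover the signed range [-8, 7]. The low
// nibble is weight j, the high nibble weight j + qk/2.
static float vec_dot_q4_0_q8_0_ref(int n, const block_q4_0 *x, const block_q8_0 *y) {
    const int qk = QK8_0;
    const int nb = n / qk;
    float sumf = 0.0f;
    for (int i = 0; i < nb; i++) {
        int sumi = 0;
        for (int j = 0; j < qk/2; ++j) {
            const int v0 = (x[i].qs[j] & 0x0F) - 8;
            const int v1 = (x[i].qs[j] >>   4) - 8;
            sumi += v0*y[i].qs[j] + v1*y[i].qs[j + qk/2];
        }
        sumf += sumi * x[i].d * y[i].d;
    }
    return sumf;
}
```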

ggml.c (14 changes)
View file

@@ -6365,7 +6365,7 @@ static void ggml_compute_forward_dup_f16(
GGML_ASSERT(false); // TODO: implement
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
printf("%s: this is not optimal - fix me\n", __func__);
if (dst->type == GGML_TYPE_F32) {
size_t id = 0;
@@ -6612,7 +6612,7 @@ static void ggml_compute_forward_dup_f32(
GGML_ASSERT(false); // TODO: implement
}
} else {
//printf("%s: this is not optimal - fix me\n", __func__);
printf("%s: this is not optimal - fix me\n", __func__);
if (dst->type == GGML_TYPE_F32) {
size_t id = 0;
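For context, the branch that logs this warning in both dup paths is the generic fallback: when the tensor layouts rule out bulk row copies, each element is copied (and type-converted) individually through its byte strides. A rough standalone sketch of why that is slow, simplified to one dimension and a raw byte copy:

```c
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Simplified view of the slow path the warning marks: one memcpy (and,
// in the real code, a type conversion) per element, driven by strides,
// instead of copying whole contiguous rows at once.
static void dup_slow(const char *src, size_t src_stride,
                     char *dst, size_t dst_stride,
                     size_t elem_size, int64_t ne) {
    for (int64_t i = 0; i < ne; i++) {
        memcpy(dst + i*dst_stride, src + i*src_stride, elem_size);
    }
}
```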
@@ -9390,6 +9390,7 @@ static void ggml_compute_forward_mul_mat(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
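The `UNUSED(t0)` following the timing call keeps the variable around without tripping `-Wunused-variable` when the perf printouts are compiled out; in ggml it is spelled `GGML_UNUSED` and boils down to a cast to void:

```c
// Evaluate the expression and discard the result; the variable counts
// as "used" for the compiler without changing behavior.
#define UNUSED(x) (void)(x)
```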
@@ -9427,7 +9428,8 @@ static void ggml_compute_forward_mul_mat(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
fprintf(stderr, "%s: params_type:%d src0:%p ->data %p src1:%p ->data %p\n", __func__, params->type, (void*)src0, src0->data, (void*)src1, src1->data);
#if defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(src0, src1, dst)) {
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
@@ -9484,7 +9486,7 @@ static void ggml_compute_forward_mul_mat(
}
}
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
return;
}
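The re-enabled CBLAS line follows the usual start/stop microsecond pattern around the kernel. A standalone equivalent of the clock it relies on (ggml routes these calls through its own timer helpers; that `ggml_perf_time_us` is gated behind a perf build flag is an assumption here):

```c
#define _POSIX_C_SOURCE 199309L
#include <stdint.h>
#include <stdio.h>
#include <time.h>

// Monotonic wall-clock in microseconds, similar in spirit to ggml_time_us().
static int64_t time_us(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t)ts.tv_sec*1000000 + ts.tv_nsec/1000;
}

int main(void) {
    int64_t t0 = time_us();
    // ... kernel under test ...
    printf("elapsed = %f ms\n", (time_us() - t0)/1000.0);
    return 0;
}
```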
@@ -9518,7 +9520,7 @@ static void ggml_compute_forward_mul_mat(
const int64_t nr0 = ne01; // src0 rows
const int64_t nr1 = ne11*ne12*ne13; // src1 rows
//printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
// distribute the thread work across the inner or outer loop based on which one is larger
@@ -9537,7 +9539,7 @@ static void ggml_compute_forward_mul_mat(
const int64_t ir110 = dr1*ith1;
const int64_t ir111 = MIN(ir110 + dr1, nr1);
//printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
// threads with no work simply yield (not sure if it helps)
if (ir010 >= ir011 || ir110 >= ir111) {
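The re-enabled printout above traces how the rows are partitioned across threads: each thread takes one contiguous chunk, and a thread whose range comes up empty simply yields, matching the check at the end of the hunk. A minimal sketch of that partitioning (names mirror the diff; treating `nth1` as the thread count along the src1 dimension is an assumption from context):

```c
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

// Split nr1 rows into contiguous chunks of size dr1 = ceil(nr1/nth1).
// Thread ith1 works on [ir110, ir111); rounding up means the last
// threads may receive an empty range and do no work.
static void row_range(int64_t nr1, int64_t nth1, int64_t ith1,
                      int64_t *ir110, int64_t *ir111) {
    const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
    *ir110 = dr1*ith1;
    *ir111 = MIN(*ir110 + dr1, nr1);
}
```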