update

parent 333f704f74
commit b4680d7cd3

5 changed files with 20 additions and 14 deletions
@@ -22,6 +22,9 @@ set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
 set(CUDACXX /usr/local/cuda-12.2/bin/nvcc)
 #GGML_USE_CUBLAS
 
+#set(CMAKE_EXE_LINKER_FLAGS -pg)
+#set(CMAKE_SHARED_LINKER_FLAGS -pg)
+
 set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
@@ -447,8 +450,8 @@ if (LLAMA_ALL_WARNINGS)
     # todo : msvc
 endif()
 
 set(c_flags ${c_flags} -save-temps --verbose ${warning_flags})
 set(cxx_flags ${cxx_flags} -save-temps --verbose ${warning_flags})
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
                     "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
                     "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
@@ -506,7 +509,7 @@ if (NOT MSVC)
     add_link_options("-Wl,-Map=${TARGET}.map")
 
 if (LLAMA_GPROF)
     add_compile_options(-pg)
 endif()
 endif()
 
@@ -583,7 +583,7 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver
 
 ### Memory/Disk Requirements
 
 As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
 
 | Model | Original size | Quantized size (4-bit) |
 |------:|--------------:|-----------------------:|
@@ -687,7 +687,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
 
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
 
 For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
 
 ### Instruction mode with Alpaca
 
@@ -824,8 +824,8 @@ int main(int argc, char ** argv) {
     llama_print_timings(ctx);
     write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
-    // dump core
-    int *ptr = 0; *ptr = 1;
+    // dont dump core
+    //int *ptr = 0; *ptr = 1;
 
     if (ctx_guidance) { llama_free(ctx_guidance); }
     llama_free(ctx);
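The toggled pair above is a deliberate crash switch: writing through a null pointer raises SIGSEGV, so the process dumps core at a known point when core dumps are enabled (e.g. `ulimit -c unlimited`); this commit comments it back out. A minimal standalone sketch of the idiom (not part of the commit), with `abort()` as the well-defined alternative:

#include <stdlib.h>

// Force a core dump at a chosen point in the program.
static void dump_core_here(void) {
    // Idiom from the diff above: a null-pointer write raises SIGSEGV and,
    // with core dumps enabled, leaves a core file. It is undefined
    // behaviour, so a compiler may legally turn it into a different trap.
    //int *ptr = 0; *ptr = 1;

    // Well-defined alternative: SIGABRT also produces a core dump.
    abort();
}

int main(void) {
    dump_core_here();
    return 0;
}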
@@ -5,7 +5,7 @@
 #include <string.h>
 #include <assert.h>
 #include <float.h>
-
+#include <stdio.h>
 #ifdef __ARM_NEON
 
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
@@ -2419,6 +2419,7 @@ static inline __m128i get_scale_shuffle(int i) {
 #endif
 
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
+    //fprintf(stderr, "%s: n:%d s:%f vx:%p vy:%p\n", __func__, n,*s, vx, vy);
     const int qk = QK8_0;
     const int nb = n / qk;
 
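`ggml_vec_dot_q4_0_q8_0` walks the two vectors in quantization blocks: `qk` is the block size (`QK8_0` is 32 in upstream ggml) and `nb = n / qk` the block count, so `n` is assumed to be a multiple of the block size. A stripped-down sketch of that block walk, with a placeholder block type standing in for the real `block_q4_0`/`block_q8_0`:

#include <stdio.h>

#define QK8_0 32 // block size, as in upstream ggml

// Placeholder block: the real block_q4_0/block_q8_0 carry a scale plus
// packed quantized values; only the per-block loop shape matters here.
struct blk { float d; };

static void vec_dot_blocks(int n, float *s, const struct blk *x, const struct blk *y) {
    const int qk = QK8_0;
    const int nb = n / qk; // number of blocks; n must be a multiple of qk

    float sum = 0.0f;
    for (int i = 0; i < nb; i++) {
        // The real kernel dequantizes and multiplies qk packed values here;
        // this sketch just combines the per-block scales.
        sum += x[i].d * y[i].d;
    }
    *s = sum;
}

int main(void) {
    struct blk x[2] = {{1.0f}, {2.0f}};
    struct blk y[2] = {{3.0f}, {4.0f}};
    float s = 0.0f;
    vec_dot_blocks(2 * QK8_0, &s, x, y);
    printf("s = %f\n", s); // 1*3 + 2*4 = 11
    return 0;
}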
ggml.c (14 changes)
@@ -6365,7 +6365,7 @@ static void ggml_compute_forward_dup_f16(
                 GGML_ASSERT(false); // TODO: implement
             }
         } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
+            printf("%s: this is not optimal - fix me\n", __func__);
 
             if (dst->type == GGML_TYPE_F32) {
                 size_t id = 0;
@@ -6612,7 +6612,7 @@ static void ggml_compute_forward_dup_f32(
                 GGML_ASSERT(false); // TODO: implement
             }
         } else {
-            //printf("%s: this is not optimal - fix me\n", __func__);
+            printf("%s: this is not optimal - fix me\n", __func__);
 
             if (dst->type == GGML_TYPE_F32) {
                 size_t id = 0;
@@ -9390,6 +9390,7 @@ static void ggml_compute_forward_mul_mat(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
         struct ggml_tensor * dst) {
+
     int64_t t0 = ggml_perf_time_us();
     UNUSED(t0);
 
@@ -9427,7 +9428,8 @@ static void ggml_compute_forward_mul_mat(
 
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
+    fprintf(stderr, "%s: params_type:%d src0:%p ->data %p src1:%p ->data %p\n", __func__, params->type, (void*)src0, src0->data, (void*)src1, src1->data);
 
 #if defined(GGML_USE_CLBLAST)
     if (ggml_cl_can_mul_mat(src0, src1, dst)) {
         if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
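The added `fprintf` traces which tensors reach the matmul: `__func__` expands to the enclosing function name, and `%p` prints the struct and data addresses, with the struct pointers cast to `void *` as `%p` expects. A self-contained sketch of the same pattern, using a hypothetical stand-in for `ggml_tensor`:

#include <stdio.h>

// Hypothetical stand-in for ggml_tensor; only the data pointer matters here.
struct tensor {
    void * data;
};

static void trace_args(const struct tensor * src0, const struct tensor * src1) {
    // %p expects void *, hence the casts on the struct pointers.
    fprintf(stderr, "%s: src0:%p ->data %p src1:%p ->data %p\n",
            __func__, (void *) src0, src0->data, (void *) src1, src1->data);
}

int main(void) {
    struct tensor a = {0};
    struct tensor b = {0};
    trace_args(&a, &b);
    return 0;
}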
@@ -9484,7 +9486,7 @@ static void ggml_compute_forward_mul_mat(
         }
     }
 
-    //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
+    printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
 
     return;
 }
@@ -9518,7 +9520,7 @@ static void ggml_compute_forward_mul_mat(
     const int64_t nr0 = ne01; // src0 rows
     const int64_t nr1 = ne11*ne12*ne13; // src1 rows
 
-    //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
+    printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
 
     // distribute the thread work across the inner or outer loop based on which one is larger
 
@@ -9537,7 +9539,7 @@ static void ggml_compute_forward_mul_mat(
     const int64_t ir110 = dr1*ith1;
     const int64_t ir111 = MIN(ir110 + dr1, nr1);
 
-    //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
+    printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111);
 
     // threads with no work simply yield (not sure if it helps)
     if (ir010 >= ir011 || ir110 >= ir111) {
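The `ir110`/`ir111` values printed above are ggml's per-thread work split: each of the `nth1` threads on this dimension takes a share `dr1` of the `nr1` src1 rows as the half-open range `[ir110, ir111)`, clamped to `nr1`, so trailing threads can end up with an empty range and simply yield. A small sketch of that arithmetic (the round-up formula for `dr1`, the driver loop, and the sizes are assumptions, not shown in this diff):

#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
    const long long nr1  = 6; // total src1 rows (example value)
    const int       nth1 = 4; // threads splitting this dimension (example value)

    // Rounded-up share per thread so that all rows are covered.
    const long long dr1 = (nr1 + nth1 - 1) / nth1;

    for (int ith1 = 0; ith1 < nth1; ith1++) {
        const long long ir110 = dr1*ith1;              // first row for this thread
        const long long ir111 = MIN(ir110 + dr1, nr1); // one past the last row

        if (ir110 >= ir111) {
            // threads with no work simply yield
            printf("thread %d: no work, yields\n", ith1);
            continue;
        }
        printf("thread %d: rows [%lld, %lld)\n", ith1, ir110, ir111);
    }
    return 0;
}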