Merge 'origin/master' into hipblas

Henri Vasserman 2023-04-30 18:40:42 +03:00
commit c73def129a
11 changed files with 145 additions and 91 deletions

.gitignore

@@ -28,7 +28,7 @@ models/*
/result
/perplexity
/embedding
/benchmark-q4_0-matmult
/benchmark-matmult
/vdot
/Pipfile

CMakeLists.txt

@@ -367,7 +367,7 @@ endif()
add_library(llama
llama.cpp
llama.h
llama_util.h)
llama-util.h)
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump

Makefile

@@ -34,10 +34,15 @@ endif
#
# keep standard at C11 and C++11
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
CFLAGS = -I. -O3 -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
LDFLAGS =
ifndef LLAMA_DEBUG
CFLAGS += -DNDEBUG
CXXFLAGS += -DNDEBUG
endif
# warnings
CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
@@ -181,14 +186,14 @@ $(info )
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
$(CXX) $(CXXFLAGS) -c $< -o $@
common.o: examples/common.cpp examples/common.h
$(CXX) $(CXXFLAGS) -c $< -o $@
clean:
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -218,9 +223,9 @@ libllama.so: llama.o ggml.o $(OBJS)
# Tests
#
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
./benchmark-q4_0-matmult
benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
./$@
.PHONY: tests
tests:

examples/CMakeLists.txt

@@ -35,4 +35,5 @@ else()
add_subdirectory(perplexity)
add_subdirectory(embedding)
add_subdirectory(save-load-state)
add_subdirectory(benchmark)
endif()

examples/benchmark/CMakeLists.txt

@@ -0,0 +1,4 @@
set(TARGET benchmark)
add_executable(${TARGET} benchmark-matmult.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/benchmark/benchmark-matmult.cpp

@@ -1,11 +1,3 @@
/*
License: MIT License
Changelog:
- 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
*/
#include <locale.h>
#include "ggml.h"
#include <assert.h>
@@ -45,7 +37,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5ld x %5ld x %5ld, nb = (%5li, %5li, %5li) - ", #TENSOR, \
TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
{ float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
@@ -98,12 +90,9 @@ int main(int argc, char ** argv) {
}
}
// create the ggml context
printf("Starting Test\n");
struct ggml_context * ctx;
//const int sizex = 4096;
//const int sizey = 11008;
@@ -125,16 +114,18 @@
#endif
//printf("Memsize required = %i\n", sizex*sizex);
ggml_type wtype = GGML_TYPE_F32;
size_t ctx_size = 0;
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizeof(float);
ctx_size += 1024*1024*100;
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += 1024*1024*16;
printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
@@ -217,7 +208,7 @@ int main(int argc, char ** argv) {
const int dimz = sizez;
long long int flops_per_dot_product = dimy + dimy;
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
// Let's use the F32 result from above as a reference for the q4_0 multiplication
@@ -234,7 +225,6 @@ int main(int argc, char ** argv) {
ggml_graph_compute(ctx, &gf31);
long long int stop = ggml_time_us();
long long int usec = stop-start;
float sec = usec/1000000;
float flops_per_usec = (1.0f*flops_per_matrix)/usec;
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
i,

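Aside from this hunk: the gFLOPS figure the benchmark prints counts one multiply and one add per element of each dot product (hence flops_per_dot_product = dimy + dimy), times the dimx * dimz dot products in a full matrix multiplication. An illustrative recomputation follows; it is not part of the commit, and the 4096 sizes are assumed purely for the example.

    // Illustrative sketch, not part of the commit: recompute the benchmark's
    // FLOP estimate. The 4096 dimensions below are assumptions for the example.
    #include <cstdio>

    int main() {
        const long long dimx = 4096, dimy = 4096, dimz = 4096;
        const long long flops_per_dot_product = dimy + dimy;               // one mul + one add per element
        const long long flops_per_matrix = flops_per_dot_product * dimx * dimz;
        std::printf("about %6.2f gFLOPS\n",
                    1.0f * flops_per_matrix / 1000 / 1000 / 1000);          // ~137.44 for these sizes
        return 0;
    }
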
examples/save-load-state/save-load-state.cpp

@@ -1,13 +1,10 @@
#include "common.h"
#include "llama.h"
#include <vector>
#include <cstdio>
#include <chrono>
#include "common.h"
#include "llama.h"
#include "llama.cpp"
using namespace std;
int main(int argc, char ** argv) {
gpt_params params;
params.model = "models/llama-7B/ggml-model.bin";
@@ -20,21 +17,25 @@ int main(int argc, char ** argv) {
return 1;
}
if (params.n_predict < 0) {
params.n_predict = 16;
}
auto lparams = llama_context_default_params();
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
lparams.n_ctx = params.n_ctx;
lparams.n_parts = params.n_parts;
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap;
lparams.use_mlock = params.use_mlock;
auto n_past = 0;
auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
auto last_n_tokens_data = std::vector<llama_token>(params.repeat_last_n, 0);
// init
auto ctx = llama_init_from_file(params.model.c_str(), lparams);
auto tokens = vector<llama_token>(params.n_ctx);
auto tokens = std::vector<llama_token>(params.n_ctx);
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
if (n_prompt_tokens < 1) {
@@ -43,26 +44,29 @@ int main(int argc, char ** argv) {
}
// evaluate prompt
llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
n_past += n_prompt_tokens;
const size_t state_size = llama_get_state_size(ctx);
uint8_t * state_mem = new uint8_t[state_size];
// Save state (rng, logits, embedding and kv_cache) to file
FILE *fp_write = fopen("dump_state.bin", "wb");
auto state_size = llama_get_state_size(ctx);
auto state_mem = new uint8_t[state_size];
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
fwrite(state_mem, 1, state_size, fp_write);
fclose(fp_write);
{
FILE *fp_write = fopen("dump_state.bin", "wb");
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
fwrite(state_mem, 1, state_size, fp_write);
fclose(fp_write);
}
// save state (last tokens)
auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
auto n_past_saved = n_past;
const auto last_n_tokens_data_saved = std::vector<llama_token>(last_n_tokens_data);
const auto n_past_saved = n_past;
// first run
printf("\n%s", params.prompt.c_str());
for (auto i = 0; i < params.n_predict; i++) {
auto logits = llama_get_logits(ctx);
auto n_vocab = llama_n_vocab(ctx);
@@ -75,6 +79,7 @@ int main(int argc, char ** argv) {
auto next_token = llama_sample_token(ctx, &candidates_p);
auto next_token_str = llama_token_to_str(ctx, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str);
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
@@ -82,24 +87,34 @@ int main(int argc, char ** argv) {
}
n_past += 1;
}
printf("\n\n");
// free old model
llama_free(ctx);
// load new model
auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
// Load state (rng, logits, embedding and kv_cache) from file
FILE *fp_read = fopen("dump_state.bin", "rb");
auto state_size2 = llama_get_state_size(ctx2);
if (state_size != state_size2) {
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
{
FILE *fp_read = fopen("dump_state.bin", "rb");
if (state_size != llama_get_state_size(ctx2)) {
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
return 1;
}
const size_t ret = fread(state_mem, 1, state_size, fp_read);
if (ret != state_size) {
fprintf(stderr, "\n%s : failed to read state\n", __func__);
return 1;
}
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
fclose(fp_read);
}
fread(state_mem, 1, state_size, fp_read);
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
fclose(fp_read);
delete[] state_mem;
// restore state (last tokens)
last_n_tokens_data = last_n_tokens_data_saved;
@@ -118,6 +133,7 @@ int main(int argc, char ** argv) {
auto next_token = llama_sample_token(ctx2, &candidates_p);
auto next_token_str = llama_token_to_str(ctx2, next_token);
last_n_tokens_data.push_back(next_token);
printf("%s", next_token_str);
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
@@ -125,6 +141,8 @@ int main(int argc, char ** argv) {
}
n_past += 1;
}
printf("\n\n");
return 0;
}

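For orientation, the state round trip this example exercises boils down to three calls from llama.h: llama_get_state_size to size a buffer, llama_copy_state_data to serialize rng, logits, embedding and kv_cache into it, and llama_set_state_data to restore. A minimal sketch, not part of the commit; the model path is a placeholder and error handling is omitted.

    // Minimal sketch of the save/restore pattern used above; illustrative only.
    #include "llama.h"
    #include <cstdint>
    #include <vector>

    int main() {
        auto lparams = llama_context_default_params();
        auto ctx = llama_init_from_file("models/llama-7B/ggml-model.bin", lparams); // placeholder path

        // serialize rng, logits, embedding and kv_cache into one buffer
        const size_t state_size = llama_get_state_size(ctx);
        std::vector<uint8_t> state_mem(state_size);
        llama_copy_state_data(ctx, state_mem.data());

        // ... later, or in a second context built from the same model ...
        llama_set_state_data(ctx, state_mem.data());

        llama_free(ctx);
        return 0;
    }
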
ggml.c

@@ -668,6 +668,33 @@ uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
return vget_high_u8(vcombine_u8(a, b));
}
int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
return vcombine_s8(vget_low_s8(a), vget_low_s8(b));
}
int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
return vcombine_s8(vget_high_s8(a), vget_high_s8(b));
}
uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
return vcombine_u8(vget_low_u8(a), vget_low_u8(b));
}
uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
return vcombine_u8(vget_high_u8(a), vget_high_u8(b));
}
int32x4_t vcvtnq_s32_f32(float32x4_t v) {
int32x4_t res;
res[0] = roundf(vgetq_lane_f32(v, 0));
res[1] = roundf(vgetq_lane_f32(v, 1));
res[2] = roundf(vgetq_lane_f32(v, 2));
res[3] = roundf(vgetq_lane_f32(v, 3));
return res;
}
#endif
#endif
@@ -2658,35 +2685,35 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
// interleave
const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
// load y
const int8x16_t v1_0l = vld1q_s8(y0->qs);
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
const int8x16_t v1_1l = vld1q_s8(y1->qs);
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
// interleave
const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h);
const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h);
const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h);
const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h);
#if defined(__ARM_FEATURE_DOTPROD)
// dot product into int32x4_t
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs);
const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
@@ -3800,6 +3827,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"DIAG_MASK_INF",
"SOFT_MAX",
"ROPE",
"ALIBI",
"CONV_1D_1S",
"CONV_1D_2S",
@@ -3848,6 +3876,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"diag_mask_inf(x)",
"soft_max(x)",
"rope(x)",
"alibi(x)",
"conv_1d_1s(x)",
"conv_1d_2s(x)",
@@ -8245,8 +8274,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
#else
float * const wdata = params->wdata;
#endif
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8263,8 +8290,11 @@ static void ggml_compute_forward_mul_mat_f16_f32(
wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
}
}
assert(id*sizeof(ggml_fp16_t) <= params->wsize);
}
#else
float * const wdata = params->wdata;
{
size_t id = 0;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -8272,6 +8302,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
}
}
assert(id*sizeof(float) <= params->wsize);
}
#endif
@@ -8537,7 +8569,10 @@ static void ggml_compute_forward_mul_mat_q_f32(
dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
id += ne00;
}
assert(id*sizeof(float) <= params->wsize);
}
const float * x = wdata;
#endif
@@ -9118,7 +9153,7 @@ static void ggml_compute_forward_alibi_f32(
//const int nb3 = src0->nb[3];
assert(nb0 == sizeof(float));
assert(ne1+n_past == ne0);
assert(ne1 + n_past == ne0); (void) n_past;
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -9179,7 +9214,7 @@ static void ggml_compute_forward_alibi_f16(
//const int nb3 = src0->nb[3];
assert(nb0 == sizeof(ggml_fp16_t));
assert(ne1+n_past == ne0);
assert(ne1 + n_past == ne0); (void) n_past;
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -11571,10 +11606,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1; // TODO: this actually is doing nothing
// the threads are still spinning
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
//printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
//printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
//printf("cur = %zu\n", cur);
#if defined(GGML_USE_CUBLAS)
// with cuBLAS, we need memory for the full 3D / 4D data of src1
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
#else
// here we need memory just for single 2D matrix from src0
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
#endif
} else {
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
}
@@ -11583,7 +11621,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
#endif
} else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
cur = 0;
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
node->n_tasks = 1;
}

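The last two ggml.c hunks resize the mul_mat work buffer when a BLAS backend takes over: with cuBLAS the buffer must hold the entire src1 tensor converted to F16, while the CPU BLAS path only needs one dequantized 2D matrix from src0 in F32. A rough size comparison under assumed shapes follows; the dimensions are illustrative, not taken from the commit.

    // Illustrative only: compare the two work-buffer sizes for assumed shapes
    // (4096x4096 weights in src0, 4096x512 activations in src1).
    #include <cstdio>
    #include <cstdint>

    int main() {
        const int64_t src0_ne0 = 4096, src0_ne1 = 4096;   // one 2D slice of src0
        const int64_t src1_nelements = 4096LL * 512;      // all elements of src1

        const size_t cur_cublas = 2 * (size_t)src1_nelements;          // F16 copy of the whole src1
        const size_t cur_blas   = 4 * (size_t)(src0_ne0 * src0_ne1);   // F32 copy of one src0 matrix

        std::printf("cuBLAS work buffer: %zu bytes (~%zu MiB)\n", cur_cublas, cur_cublas >> 20);
        std::printf("BLAS work buffer:   %zu bytes (~%zu MiB)\n", cur_blas,   cur_blas   >> 20);
        return 0;
    }
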
ggml.h

@@ -701,8 +701,8 @@ extern "C" {
struct ggml_tensor * c1);
// Mapping operations
GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx,

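The ggml.h hunk only drops GGML_API from the two callback typedefs (that macro is meant for exported functions, not typedefs); the mapping API itself is unchanged. A hypothetical usage sketch of ggml_map_unary_f32 with that callback signature, assuming the (ctx, tensor, callback) argument order from ggml.h of this period:

    // Hypothetical sketch, not part of the commit: a custom element-wise op
    // plugged in through ggml_map_unary_f32 (argument order assumed).
    #include "ggml.h"
    #include <cstddef>

    static void my_relu(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) {
            dst[i] = src[i] > 0.0f ? src[i] : 0.0f;   // element-wise ReLU
        }
    }

    int main() {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_f32(x, -1.0f);                        // fill with a test value

        struct ggml_tensor * y = ggml_map_unary_f32(ctx, x, my_relu);

        struct ggml_cgraph gf = ggml_build_forward(y);
        ggml_graph_compute(ctx, &gf);

        ggml_free(ctx);
        return 0;
    }
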
llama_util.h → llama-util.h (Executable file → Normal file)

@@ -430,5 +430,4 @@ struct llama_ctx_buffer {
typedef llama_buffer llama_ctx_buffer;
#endif
#endif

llama.cpp

@@ -5,7 +5,7 @@
#include <cstdio>
#endif
#include "llama_util.h"
#include "llama-util.h"
#include "llama.h"
#include "ggml.h"
@@ -33,7 +33,6 @@
#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
// available llama models
enum e_model {
MODEL_UNKNOWN,
@@ -781,7 +780,7 @@ static bool kv_cache_init(
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int64_t n_mem = (int64_t)n_layer*n_ctx;
const int64_t n_mem = n_layer*n_ctx;
const int64_t n_elements = n_embd*n_mem;
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);