Merge branch 'master' of https://github.com/ggerganov/llama.cpp into jon/use-hardware-cores
This commit is contained in:
commit
b17d54eda3
19 changed files with 501 additions and 120 deletions
|
@ -14,3 +14,6 @@ indent_size = 4
|
|||
|
||||
[Makefile]
|
||||
indent_style = tab
|
||||
|
||||
[prompts/*.txt]
|
||||
insert_final_newline = unset
|
||||
|
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -23,6 +23,7 @@ models/*
|
|||
/result
|
||||
/perplexity
|
||||
/embedding
|
||||
/benchmark-q4_0-matmult
|
||||
/Pipfile
|
||||
|
||||
arm_neon.h
|
||||
|
|
|
@ -56,6 +56,10 @@ option(LLAMA_AVX "llama: enable AVX"
|
|||
option(LLAMA_AVX2 "llama: enable AVX2" ON)
|
||||
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
|
||||
option(LLAMA_FMA "llama: enable FMA" ON)
|
||||
# in MSVC F16C is implied with AVX2/AVX512
|
||||
if (NOT MSVC)
|
||||
option(LLAMA_F16C "llama: enable F16C" ON)
|
||||
endif()
|
||||
|
||||
# 3rd party libs
|
||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||
|
@ -207,7 +211,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
|
|||
add_compile_options(/arch:AVX)
|
||||
endif()
|
||||
else()
|
||||
add_compile_options(-mf16c)
|
||||
if (LLAMA_F16C)
|
||||
add_compile_options(-mf16c)
|
||||
endif()
|
||||
if (LLAMA_FMA)
|
||||
add_compile_options(-mfma)
|
||||
endif()
|
||||
|
@ -247,7 +253,6 @@ endif()
|
|||
add_library(llama
|
||||
llama.cpp
|
||||
llama.h
|
||||
llama_internal.h
|
||||
llama_util.h)
|
||||
|
||||
target_include_directories(llama PUBLIC .)
|
||||
|
|
9
Makefile
9
Makefile
|
@ -142,14 +142,14 @@ default: main quantize perplexity embedding
|
|||
ggml.o: ggml.c ggml.h
|
||||
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
|
||||
|
||||
llama.o: llama.cpp llama.h llama_util.h llama_internal.h
|
||||
llama.o: llama.cpp llama.h llama_util.h
|
||||
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
|
||||
|
||||
common.o: examples/common.cpp examples/common.h
|
||||
$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
|
||||
|
||||
clean:
|
||||
rm -vf *.o main quantize quantize-stats perplexity embedding
|
||||
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
|
||||
|
||||
main: examples/main/main.cpp ggml.o llama.o common.o
|
||||
$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
|
||||
|
@ -171,10 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
|
|||
|
||||
libllama.so: llama.o ggml.o
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
|
||||
|
||||
#
|
||||
# Tests
|
||||
#
|
||||
|
||||
benchmark: ggml.o
|
||||
$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
|
||||
./benchmark-q4_0-matmult
|
||||
|
||||
.PHONY: tests
|
||||
tests:
|
||||
bash ./tests/run-tests.sh
|
||||
|
|
43
README.md
43
README.md
|
@ -49,6 +49,7 @@ New features will probably be added mostly through community contributions.
|
|||
|
||||
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
|
||||
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
|
||||
- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)
|
||||
|
||||
**UI:**
|
||||
|
||||
|
@ -149,21 +150,43 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
|
|||
|
||||
## Usage
|
||||
|
||||
Here are the step for the LLaMA-7B model:
|
||||
Here are the step for the LLaMA-7B model.
|
||||
|
||||
### Get the Code
|
||||
|
||||
```bash
|
||||
# build this repo
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
make
|
||||
```
|
||||
|
||||
#For Windows and CMake, use the following command instead:
|
||||
cd <path_to_llama_folder>
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake --build . --config Release
|
||||
### Build
|
||||
|
||||
Note: For Windows, CMake or Zig can be used.
|
||||
|
||||
1. Use `make`
|
||||
|
||||
```bash
|
||||
make
|
||||
```
|
||||
|
||||
1. Use CMake
|
||||
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
cmake ..
|
||||
cmake --build . --config Release
|
||||
```
|
||||
|
||||
1. Use Zig
|
||||
|
||||
```bash
|
||||
zig build -Drelease-fast
|
||||
```
|
||||
|
||||
### Prepare Data & Run
|
||||
|
||||
```bash
|
||||
# obtain the original LLaMA model weights and place them in ./models
|
||||
ls ./models
|
||||
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
|
||||
|
@ -181,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1
|
|||
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
|
||||
```
|
||||
|
||||
Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
|
||||
|
||||
When running the larger models, make sure you have enough disk space to store all the intermediate files.
|
||||
|
||||
### Memory/Disk Requirements
|
||||
|
|
22
build.zig
22
build.zig
|
@ -1,16 +1,14 @@
|
|||
const std = @import("std");
|
||||
|
||||
pub fn build(b: *std.Build) void {
|
||||
pub fn build(b: *std.build.Builder) void {
|
||||
const target = b.standardTargetOptions(.{});
|
||||
const optimize = b.standardOptimizeOption(.{});
|
||||
const optimize = b.standardReleaseOptions();
|
||||
const want_lto = b.option(bool, "lto", "Want -fLTO");
|
||||
|
||||
const lib = b.addStaticLibrary(.{
|
||||
.name = "llama",
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
});
|
||||
const lib = b.addStaticLibrary("llama", null);
|
||||
lib.want_lto = want_lto;
|
||||
lib.setTarget(target);
|
||||
lib.setBuildMode(optimize);
|
||||
lib.linkLibCpp();
|
||||
lib.addIncludePath(".");
|
||||
lib.addIncludePath("examples");
|
||||
|
@ -44,16 +42,12 @@ pub fn build(b: *std.Build) void {
|
|||
fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
|
||||
const b = args.b;
|
||||
const lib = args.lib;
|
||||
const target = args.target;
|
||||
const optimize = args.optimize;
|
||||
const want_lto = args.want_lto;
|
||||
|
||||
const exe = b.addExecutable(.{
|
||||
.name = name,
|
||||
.target = target,
|
||||
.optimize = optimize,
|
||||
});
|
||||
const exe = b.addExecutable(name, null);
|
||||
exe.want_lto = want_lto;
|
||||
lib.setTarget(args.target);
|
||||
lib.setBuildMode(args.optimize);
|
||||
exe.addIncludePath(".");
|
||||
exe.addIncludePath("examples");
|
||||
exe.addCSourceFiles(&.{
|
||||
|
|
|
@ -7,4 +7,4 @@
|
|||
cd `dirname $0`
|
||||
cd ..
|
||||
|
||||
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
|
||||
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
|
||||
|
|
270
examples/benchmark/benchmark-q4_0-matmult.c
Normal file
270
examples/benchmark/benchmark-q4_0-matmult.c
Normal file
|
@ -0,0 +1,270 @@
|
|||
/*
|
||||
License: MIT License
|
||||
|
||||
Changelog:
|
||||
- 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
|
||||
|
||||
*/
|
||||
|
||||
#include <locale.h>
|
||||
#include "ggml.h"
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <cstring>
|
||||
#include <cstdio>
|
||||
#include <cinttypes>
|
||||
#include <unordered_map>
|
||||
#include <queue>
|
||||
#include <string.h>
|
||||
#include <cassert>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <iterator>
|
||||
#include <algorithm>
|
||||
|
||||
float tensor_sum_elements(struct ggml_tensor * tensor) {
|
||||
float sum = 0;
|
||||
if (tensor->type==6) {
|
||||
for (int j = 0; j < tensor->ne[1]; j++) {
|
||||
for (int k = 0; k < tensor->ne[0]; k++) {
|
||||
sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
|
||||
}
|
||||
}
|
||||
}
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
These are mapping to unknown
|
||||
GGML_TYPE_I8,
|
||||
GGML_TYPE_I16,
|
||||
GGML_TYPE_I32,
|
||||
GGML_TYPE_COUNT,
|
||||
*/
|
||||
|
||||
#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"
|
||||
|
||||
#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
|
||||
TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
|
||||
TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
|
||||
{ float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }
|
||||
|
||||
struct benchmark_params_struct {
|
||||
int32_t n_threads = 1;
|
||||
int32_t n_iterations = 10;
|
||||
};
|
||||
|
||||
void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations);
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
|
||||
|
||||
struct benchmark_params_struct benchmark_params;
|
||||
|
||||
bool invalid_param = false;
|
||||
std::string arg;
|
||||
for (int i = 1; i < argc; i++) {
|
||||
arg = argv[i];
|
||||
|
||||
if (arg == "-t" || arg == "--threads") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
benchmark_params.n_threads = std::stoi(argv[i]);
|
||||
} else if (arg == "-i" || arg == "--iter") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
benchmark_params.n_iterations = std::stoi(argv[i]);
|
||||
} else if (arg == "-h" || arg == "--help") {
|
||||
print_usage(argc, argv, benchmark_params);
|
||||
exit(0);
|
||||
}
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv, benchmark_params);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// create the ggml context
|
||||
printf("Starting Test\n");
|
||||
|
||||
|
||||
|
||||
struct ggml_context * ctx;
|
||||
//const int sizex = 4096;
|
||||
//const int sizey = 11008;
|
||||
|
||||
#undef VERBOSE_DEBUGGING
|
||||
#ifndef VERBOSE_DEBUGGING
|
||||
const int sizey = 4096;
|
||||
const int sizex = 11008;
|
||||
const int sizez = 128;
|
||||
#else
|
||||
/* Working - let's increase size */
|
||||
const int sizey = 1;
|
||||
const int sizex = (8*32);
|
||||
const int sizez = 1;
|
||||
|
||||
/*const int sizey = 1;
|
||||
const int sizex = 3*(8*32);
|
||||
const int sizez = 1;*/
|
||||
#endif
|
||||
|
||||
//printf("Memsize required = %i\n", sizex*sizex);
|
||||
ggml_type wtype = GGML_TYPE_F32;
|
||||
|
||||
size_t ctx_size = 0;
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(wtype);
|
||||
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
|
||||
ctx_size += sizex*sizeof(float);
|
||||
ctx_size += 1024*1024*100;
|
||||
|
||||
printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
|
||||
|
||||
struct ggml_init_params params = {
|
||||
/*.mem_size =*/ ctx_size,
|
||||
/*.mem_buffer =*/ NULL,
|
||||
/* no_alloc =*/ 0
|
||||
};
|
||||
|
||||
ctx = ggml_init(params);
|
||||
if (!ctx) {
|
||||
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
printf("Creating new tensors\n");
|
||||
// printf("Creating new tensor m1\n");
|
||||
struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
|
||||
ggml_set_f32(m11, 1.0f);
|
||||
|
||||
// printf("Creating new tensor m1\n");
|
||||
struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
|
||||
ggml_set_f32(m12, 1.5f);
|
||||
|
||||
// printf("Creating new tensor m2\n");
|
||||
struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
|
||||
ggml_set_f32(m2, 2.0f);
|
||||
|
||||
printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
|
||||
// printf("Creating new tensor m11xm2\n");
|
||||
struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
|
||||
|
||||
// printf("Creating compute graph\n");
|
||||
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
|
||||
|
||||
gf.n_threads=benchmark_params.n_threads;
|
||||
printf("cgraph->n_threads=%i\n",gf.n_threads);
|
||||
|
||||
TENSOR_DUMP(m11);
|
||||
TENSOR_DUMP(m2);
|
||||
|
||||
ggml_graph_compute(ctx, &gf);
|
||||
|
||||
TENSOR_DUMP(gf.nodes[0]);
|
||||
|
||||
printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");
|
||||
|
||||
int32_t nelements = sizex*sizey;
|
||||
int32_t ne[2] = { sizex, sizey };
|
||||
|
||||
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||
|
||||
// Set up a the benchmark matrices
|
||||
// printf("Creating new tensor q11 & Running quantize\n");
|
||||
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||
ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());
|
||||
|
||||
// Set up a the compute graph
|
||||
// printf("Creating new tensor q31\n");
|
||||
struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
|
||||
|
||||
// printf("Creating compute graph\n");
|
||||
struct ggml_cgraph gf31 = ggml_build_forward(q31);
|
||||
gf31.n_threads=benchmark_params.n_threads;
|
||||
|
||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||
// printf("Creating new tensor q12 & Running quantize\n");
|
||||
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
|
||||
ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());
|
||||
|
||||
// printf("Creating new tensor q32\n");
|
||||
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
|
||||
|
||||
//printf("Creating compute graph\n");
|
||||
struct ggml_cgraph gf32 = ggml_build_forward(q32);
|
||||
gf32.n_threads=benchmark_params.n_threads;
|
||||
printf("cgraph->n_threads=%i\n",gf31.n_threads);
|
||||
|
||||
const int dimx = sizex;
|
||||
const int dimy = sizey;
|
||||
const int dimz = sizez;
|
||||
long long int flops_per_dot_product = dimy + dimy;
|
||||
long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
|
||||
printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
|
||||
|
||||
|
||||
// Let's use the F32 result from above as a reference for the q4_0 multiplication
|
||||
float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);
|
||||
|
||||
|
||||
printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
|
||||
printf("==============================================================================================\n");
|
||||
|
||||
for (int i=0;i<benchmark_params.n_iterations ;i++) {
|
||||
|
||||
long long int start = ggml_time_us();
|
||||
//printf("Running ggml_graph_compute\n");
|
||||
ggml_graph_compute(ctx, &gf31);
|
||||
long long int stop = ggml_time_us();
|
||||
long long int usec = stop-start;
|
||||
float sec = usec/1000000;
|
||||
float flops_per_usec = (1.0f*flops_per_matrix)/usec;
|
||||
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
|
||||
i,
|
||||
gf31.n_threads,
|
||||
sizex, sizey, sizez, flops_per_matrix,
|
||||
usec,flops_per_usec);
|
||||
|
||||
#ifdef VERBOSE_DEBUGGING
|
||||
TENSOR_DUMP("res",gf31.nodes[0])
|
||||
#endif
|
||||
|
||||
// Check that the matrix multiplication result is in the right ballpark
|
||||
// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
|
||||
float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
|
||||
float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
|
||||
float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
|
||||
|
||||
if (delta > allowed_delta) {
|
||||
printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
|
||||
sum_of_F32_reference,
|
||||
sum_of_Q4_result,
|
||||
delta,
|
||||
allowed_delta
|
||||
);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// Running a different graph computation to make sure we override the CPU cache lines
|
||||
ggml_graph_compute(ctx, &gf32);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -7,12 +7,6 @@
|
|||
#include <iterator>
|
||||
#include <algorithm>
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||
#include <alloca.h>
|
||||
#endif
|
||||
|
||||
#if defined (_WIN32)
|
||||
#include <fcntl.h>
|
||||
#include <io.h>
|
||||
|
|
|
@ -10,6 +10,6 @@ cd ..
|
|||
./main --color --instruct --threads 4 \
|
||||
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
|
||||
--file ./prompts/alpaca.txt \
|
||||
--batch_size 8 --ctx_size 2048 \
|
||||
--batch_size 8 --ctx_size 2048 -n -1 \
|
||||
--repeat_last_n 64 --repeat_penalty 1.3 \
|
||||
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#include "ggml.h"
|
||||
|
||||
#define LLAMA_API_INTERNAL
|
||||
#include "llama.h"
|
||||
#include "llama_internal.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
|
|
|
@ -28,10 +28,8 @@
|
|||
];
|
||||
installPhase = ''
|
||||
mkdir -p $out/bin
|
||||
mv bin/main $out/bin/llama
|
||||
mv bin/quantize $out/bin/quantize
|
||||
mv bin/embedding $out/bin/embedding
|
||||
mv bin/perplexity $out/bin/perplexity
|
||||
mv bin/* $out/bin/
|
||||
mv $out/bin/main $out/bin/llama
|
||||
|
||||
echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
|
||||
cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
|
||||
|
|
206
ggml.c
206
ggml.c
|
@ -114,6 +114,14 @@ typedef void* thread_ret_t;
|
|||
#define GGML_MEM_ALIGN 16
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN)
|
||||
#define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
|
||||
#else
|
||||
#define GGML_ALIGNED_MALLOC(size) aligned_alloc(GGML_MEM_ALIGN, size)
|
||||
#define GGML_ALIGNED_FREE(ptr) free(ptr)
|
||||
#endif
|
||||
|
||||
#define UNUSED(x) (void)(x)
|
||||
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
|
||||
|
||||
|
@ -483,6 +491,77 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
|||
}
|
||||
#endif
|
||||
|
||||
#if __ARM_NEON
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
|
||||
inline static uint16_t vaddvq_u8(uint8x16_t v) {
|
||||
return
|
||||
(uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
|
||||
(uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
|
||||
(uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
|
||||
(uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
|
||||
(uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
|
||||
(uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
|
||||
(uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
|
||||
(uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s16(int16x8_t v) {
|
||||
return
|
||||
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
||||
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
||||
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
||||
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
||||
}
|
||||
|
||||
inline static uint32_t vaddvq_u16(uint16x8_t v) {
|
||||
return
|
||||
(uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
|
||||
(uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
|
||||
(uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
|
||||
(uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
|
||||
}
|
||||
|
||||
inline static int32_t vaddvq_s32(int32x4_t v) {
|
||||
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
||||
}
|
||||
|
||||
inline static float vaddvq_f32(float32x4_t v) {
|
||||
return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
|
||||
}
|
||||
|
||||
inline float vminvq_f32(float32x4_t v) {
|
||||
return
|
||||
MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline float vmaxvq_f32(float32x4_t v) {
|
||||
return
|
||||
MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
|
||||
MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
|
||||
}
|
||||
|
||||
inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
|
||||
return vget_low_s8(vcombine_s8(a, b));
|
||||
}
|
||||
|
||||
inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
|
||||
return vget_high_s8(vcombine_s8(a, b));
|
||||
}
|
||||
|
||||
inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
|
||||
return vget_low_u8(vcombine_u8(a, b));
|
||||
}
|
||||
|
||||
inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
|
||||
return vget_high_u8(vcombine_u8(a, b));
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// method 5
|
||||
// blocks of QK elements
|
||||
// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
|
||||
|
@ -1210,15 +1289,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
|
|||
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
|
||||
#define GGML_F32x4_ADD vaddq_f32
|
||||
#define GGML_F32x4_MUL vmulq_f32
|
||||
#if defined(__ARM_FEATURE_QRDMX)
|
||||
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
||||
#else
|
||||
#define GGML_F32x4_REDUCE_ONE(x) \
|
||||
(vgetq_lane_f32(x, 0) + \
|
||||
vgetq_lane_f32(x, 1) + \
|
||||
vgetq_lane_f32(x, 2) + \
|
||||
vgetq_lane_f32(x, 3))
|
||||
#endif
|
||||
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
|
||||
|
@ -1841,55 +1912,43 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
|||
// 4-bit -> 8-bit
|
||||
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
|
||||
const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
|
||||
|
||||
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
|
||||
const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));
|
||||
|
||||
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
|
||||
const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
|
||||
|
||||
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
|
||||
const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));
|
||||
|
||||
// sub 8
|
||||
const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
|
||||
const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
|
||||
|
||||
const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
|
||||
const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);
|
||||
|
||||
const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
|
||||
const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
|
||||
|
||||
const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
|
||||
const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
// dot product into int16x8_t
|
||||
// dot product into int32x4_t
|
||||
int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
|
||||
int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
|
||||
|
||||
p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
|
||||
p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
|
||||
|
||||
// scalar
|
||||
#if defined(__ARM_FEATURE_QRDMX)
|
||||
sum0 += x0->d * y0->d * vaddvq_s32(p_0);
|
||||
sum1 += x1->d * y1->d * vaddvq_s32(p_1);
|
||||
#else
|
||||
sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
|
||||
sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
|
||||
#endif
|
||||
sum0 += x0->d*y0->d*vaddvq_s32(p_0);
|
||||
sum1 += x1->d*y1->d*vaddvq_s32(p_1);
|
||||
#else
|
||||
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
|
||||
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
|
||||
|
||||
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
|
||||
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
|
||||
|
||||
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
|
||||
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
|
||||
|
||||
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
|
||||
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
|
||||
|
||||
|
@ -1902,14 +1961,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
|||
const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
|
||||
const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
|
||||
|
||||
// scalar
|
||||
#if defined(__ARM_FEATURE_QRDMX)
|
||||
sum0 += x0->d * y0->d * vaddvq_s16(p_0);
|
||||
sum1 += x1->d * y1->d * vaddvq_s16(p_1);
|
||||
#else
|
||||
sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
|
||||
sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
|
||||
#endif
|
||||
sum0 += x0->d*y0->d*vaddvq_s16(p_0);
|
||||
sum1 += x1->d*y1->d*vaddvq_s16(p_1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -2152,18 +2205,20 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
|
|||
const uint8_t * restrict p0 = x[i].qs;
|
||||
const uint8_t * restrict p1 = y[i].qs;
|
||||
|
||||
int sumi = 0;
|
||||
for (int j = 0; j < QK/2; j++) {
|
||||
const uint8_t v0 = p0[j];
|
||||
const uint8_t v1 = p1[j];
|
||||
|
||||
const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
|
||||
const float f1 = d0*((int8_t) (v0 >> 4) - 8);
|
||||
const int8_t i0 = (int8_t) (v0 & 0xf) - 8;
|
||||
const int8_t i1 = (int8_t) (v0 >> 4) - 8;
|
||||
|
||||
const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
|
||||
const float f3 = d1*((int8_t) (v1 >> 4) - 8);
|
||||
const int8_t i2 = (int8_t) (v1 & 0xf) - 8;
|
||||
const int8_t i3 = (int8_t) (v1 >> 4) - 8;
|
||||
|
||||
sumf += f0*f2 + f1*f3;
|
||||
sumi += i0*i2 + i1*i3;
|
||||
}
|
||||
sumf += d0 * d1 * sumi;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -2255,36 +2310,71 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
|
|||
float sum10 = 0.0f;
|
||||
float sum11 = 0.0f;
|
||||
|
||||
for (int i = 0; i < nb; ++i) {
|
||||
for (int i = 0; i < nb; i += 2) {
|
||||
const block_q4_1 * restrict x0 = &x[i + 0];
|
||||
const block_q4_1 * restrict y0 = &y[i + 0];
|
||||
const block_q4_1 * restrict x1 = &x[i + 1];
|
||||
const block_q4_1 * restrict y1 = &y[i + 1];
|
||||
|
||||
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
||||
|
||||
const uint8x16_t v0_0 = vld1q_u8(x0->qs);
|
||||
const uint8x16_t v1_0 = vld1q_u8(y0->qs);
|
||||
const uint8x16_t v0_1 = vld1q_u8(x1->qs);
|
||||
const uint8x16_t v1_1 = vld1q_u8(y1->qs);
|
||||
|
||||
// and with 0xf
|
||||
// 4-bit -> 8-bit
|
||||
const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
|
||||
const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
|
||||
|
||||
const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
|
||||
const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);
|
||||
|
||||
// dot product into uint16x8_t
|
||||
const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
|
||||
const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
|
||||
|
||||
const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
|
||||
const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
|
||||
|
||||
const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
|
||||
const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
|
||||
const uint8x16_t v0_1l = vandq_u8(v0_1, m4b);
|
||||
const uint8x16_t v1_1l = vandq_u8(v1_1, m4b);
|
||||
const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4);
|
||||
const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4);
|
||||
|
||||
sum00 += x0->m*y0->m;
|
||||
sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
|
||||
sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
|
||||
sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
|
||||
|
||||
sum00 += x1->m*y1->m;
|
||||
sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h));
|
||||
sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h));
|
||||
|
||||
#if defined(__ARM_FEATURE_DOTPROD)
|
||||
// dot product into int32x4_t
|
||||
int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l);
|
||||
int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l);
|
||||
|
||||
p_0 = vdotq_s32(p_0, v0_0h, v1_0h);
|
||||
p_1 = vdotq_s32(p_1, v0_1h, v1_1h);
|
||||
|
||||
sum11 += x0->d*y0->d*vaddvq_s32(p_0);
|
||||
sum11 += x1->d*y1->d*vaddvq_s32(p_1);
|
||||
#else
|
||||
const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
|
||||
const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
|
||||
const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
|
||||
const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
|
||||
|
||||
const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l));
|
||||
const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l));
|
||||
const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h));
|
||||
const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h));
|
||||
|
||||
const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h);
|
||||
const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h);
|
||||
|
||||
const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h);
|
||||
const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h);
|
||||
|
||||
const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0);
|
||||
const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1);
|
||||
|
||||
sum11 += x0->d*y0->d*vaddvq_u16(p_0);
|
||||
sum11 += x1->d*y1->d*vaddvq_u16(p_1);
|
||||
#endif
|
||||
}
|
||||
|
||||
sumf = QK*sum00 + sum01 + sum10 + sum11;
|
||||
|
@ -2966,7 +3056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||
|
||||
*ctx = (struct ggml_context) {
|
||||
/*.mem_size =*/ params.mem_size,
|
||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
|
||||
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size),
|
||||
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
|
||||
/*.no_alloc =*/ params.no_alloc,
|
||||
/*.n_objects =*/ 0,
|
||||
|
@ -3001,7 +3091,7 @@ void ggml_free(struct ggml_context * ctx) {
|
|||
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
|
||||
|
||||
if (ctx->mem_buffer_owned) {
|
||||
free(ctx->mem_buffer);
|
||||
GGML_ALIGNED_FREE(ctx->mem_buffer);
|
||||
}
|
||||
|
||||
found = true;
|
||||
|
@ -6435,7 +6525,7 @@ static void ggml_compute_forward_mul_mat_f32(
|
|||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
|
@ -6607,7 +6697,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
|
|||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
|
@ -6820,7 +6910,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
|
|||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
ne11, ne01, ne10,
|
||||
1.0f, y, ne10,
|
||||
x, ne10,
|
||||
x, ne00,
|
||||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
|
@ -9273,7 +9363,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
|
|||
struct ggml_cgraph result = {
|
||||
/*.n_nodes =*/ 0,
|
||||
/*.n_leafs =*/ 0,
|
||||
/*.n_threads =*/ 0,
|
||||
/*.n_threads =*/ GGML_DEFAULT_N_THREADS,
|
||||
/*.work_size =*/ 0,
|
||||
/*.work =*/ NULL,
|
||||
/*.nodes =*/ { NULL },
|
||||
|
@ -9893,8 +9983,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
|
|||
|
||||
GGML_PRINT("=== GRAPH ===\n");
|
||||
|
||||
GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
|
||||
GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size);
|
||||
GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
|
||||
GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
|
||||
|
||||
GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
|
|
11
ggml.h
11
ggml.h
|
@ -177,11 +177,12 @@ extern "C" {
|
|||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_NODES 4096
|
||||
#define GGML_MAX_PARAMS 16
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_OPT 4
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_NODES 4096
|
||||
#define GGML_MAX_PARAMS 16
|
||||
#define GGML_MAX_CONTEXTS 64
|
||||
#define GGML_MAX_OPT 4
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
// we use the built-in 16-bit float type
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
|
||||
#include "llama_util.h"
|
||||
#include "llama.h"
|
||||
#include "llama_internal.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
|
|
11
llama.h
11
llama.h
|
@ -179,4 +179,15 @@ extern "C" {
|
|||
}
|
||||
#endif
|
||||
|
||||
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
|
||||
#ifdef LLAMA_API_INTERNAL
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
struct ggml_tensor;
|
||||
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
|
||||
#endif
|
||||
|
||||
#endif // LLAMA_H
|
||||
|
|
|
@ -1,12 +0,0 @@
|
|||
// Internal header to be included by llama.cpp and tests/benchmarks only.
|
||||
|
||||
#ifndef LLAMA_INTERNAL_H
|
||||
#define LLAMA_INTERNAL_H
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
struct ggml_tensor;
|
||||
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
|
||||
|
||||
#endif // LLAMA_INTERNAL_H
|
|
@ -4,4 +4,4 @@ User: Hello, Bob.
|
|||
Bob: Hello. How may I help you today?
|
||||
User: Please tell me the largest city in Europe.
|
||||
Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
|
||||
User:
|
||||
User:
|
|
@ -15,4 +15,4 @@ Answer: The calculate tool says it is 9.3333333333
|
|||
Question: What is capital of france?
|
||||
Thought: Do I need to use an action? No, I know the answer
|
||||
Answer: Paris is the capital of France
|
||||
Question:
|
||||
Question:
|
Loading…
Add table
Add a link
Reference in a new issue