Merge pull request #1 from ggerganov/master

pull down new code
wbpxre150 2023-04-14 12:48:36 +08:00 committed by GitHub
commit 18cf46c781
20 changed files with 527 additions and 136 deletions

.editorconfig

@@ -14,3 +14,6 @@ indent_size = 4
[Makefile]
indent_style = tab
+
+[prompts/*.txt]
+insert_final_newline = unset

.gitignore

@@ -23,6 +23,7 @@ models/*
/result
/perplexity
/embedding
+/benchmark-q4_0-matmult
/Pipfile

arm_neon.h

CMakeLists.txt

@@ -56,6 +56,10 @@ option(LLAMA_AVX "llama: enable AVX"
option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_FMA "llama: enable FMA" ON)
+# in MSVC F16C is implied with AVX2/AVX512
+if (NOT MSVC)
+    option(LLAMA_F16C "llama: enable F16C" ON)
+endif()

# 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)

@@ -207,7 +211,9 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
        add_compile_options(/arch:AVX)
    endif()
else()
+    if (LLAMA_F16C)
        add_compile_options(-mf16c)
+    endif()
    if (LLAMA_FMA)
        add_compile_options(-mfma)
    endif()
@@ -247,7 +253,6 @@ endif()
add_library(llama
            llama.cpp
            llama.h
-           llama_internal.h
            llama_util.h)

target_include_directories(llama PUBLIC .)
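The new LLAMA_F16C switch only exists on non-MSVC toolchains (MSVC implies F16C with AVX2/AVX512). As a quick illustration, a configure step that turns it off might look like the sketch below; only the `-DLLAMA_F16C` flag comes from the option added above, the rest is assumed boilerplate.

```bash
# hypothetical out-of-tree configure/build with the F16C intrinsics disabled
mkdir build && cd build
cmake .. -DLLAMA_F16C=OFF
cmake --build . --config Release
```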

Makefile

@@ -142,14 +142,14 @@ default: main quantize perplexity embedding
ggml.o: ggml.c ggml.h
	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o

-llama.o: llama.cpp llama.h llama_util.h llama_internal.h
+llama.o: llama.cpp llama.h llama_util.h
	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o

common.o: examples/common.cpp examples/common.h
	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o

clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult

main: examples/main/main.cpp ggml.o llama.o common.o
	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)

@@ -171,10 +171,15 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
libllama.so: llama.o ggml.o
	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)

#
# Tests
#

+benchmark: ggml.o
+	$(CXX) $(CXXFLAGS) examples/benchmark/benchmark-q4_0-matmult.c ggml.o -o benchmark-q4_0-matmult $(LDFLAGS)
+	./benchmark-q4_0-matmult
+
.PHONY: tests
tests:
	bash ./tests/run-tests.sh
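The new `benchmark` target compiles the Q4_0 matrix-multiplication benchmark against `ggml.o` and runs it once. A minimal way to exercise it from a checkout (assuming the default compiler picked up by the Makefile) is simply:

```bash
# builds examples/benchmark/benchmark-q4_0-matmult.c and runs the resulting binary
make benchmark
```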

README.md

@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

**Hot topics:**

+- [Add GPU support to ggml](https://github.com/ggerganov/llama.cpp/discussions/915)
- [Roadmap Apr 2023](https://github.com/ggerganov/llama.cpp/discussions/784)

## Description

@@ -48,6 +49,7 @@ New features will probably be added mostly through community contributions.

- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
- Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
+- Node.js: [hlhr202/llama-node](https://github.com/hlhr202/llama-node)

**UI:**

@@ -148,21 +150,43 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8

## Usage

-Here are the step for the LLaMA-7B model:
+Here are the step for the LLaMA-7B model.
+
+### Get the Code

```bash
-# build this repo
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
-make
+```

-#For Windows and CMake, use the following command instead:
-cd <path_to_llama_folder>
-mkdir build
-cd build
-cmake ..
-cmake --build . --config Release
+### Build
+
+Note: For Windows, CMake or Zig can be used.
+
+1. Use `make`
+
+```bash
+make
+```
+
+1. Use CMake
+
+```bash
+mkdir build
+cd build
+cmake ..
+cmake --build . --config Release
+```
+
+1. Use Zig
+
+```bash
+zig build -Drelease-fast
+```
+
+### Prepare Data & Run
+
+```bash
# obtain the original LLaMA model weights and place them in ./models
ls ./models
65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model

@@ -180,8 +204,6 @@ python3 convert-pth-to-ggml.py models/7B/ 1
./main -m ./models/7B/ggml-model-q4_0.bin -n 128
```

-Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
-
When running the larger models, make sure you have enough disk space to store all the intermediate files.

### Memory/Disk Requirements

build.zig

@@ -1,16 +1,14 @@
const std = @import("std");

-pub fn build(b: *std.Build) void {
+pub fn build(b: *std.build.Builder) void {
    const target = b.standardTargetOptions(.{});
-   const optimize = b.standardOptimizeOption(.{});
+   const optimize = b.standardReleaseOptions();
    const want_lto = b.option(bool, "lto", "Want -fLTO");

-   const lib = b.addStaticLibrary(.{
-       .name = "llama",
-       .target = target,
-       .optimize = optimize,
-   });
+   const lib = b.addStaticLibrary("llama", null);
    lib.want_lto = want_lto;
+   lib.setTarget(target);
+   lib.setBuildMode(optimize);
    lib.linkLibCpp();
    lib.addIncludePath(".");
    lib.addIncludePath("examples");

@@ -44,16 +42,12 @@ pub fn build(b: *std.Build) void {
fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
    const b = args.b;
    const lib = args.lib;
-   const target = args.target;
-   const optimize = args.optimize;
    const want_lto = args.want_lto;

-   const exe = b.addExecutable(.{
-       .name = name,
-       .target = target,
-       .optimize = optimize,
-   });
+   const exe = b.addExecutable(name, null);
    exe.want_lto = want_lto;
+   lib.setTarget(args.target);
+   lib.setBuildMode(args.optimize);
    exe.addIncludePath(".");
    exe.addIncludePath("examples");
    exe.addCSourceFiles(&.{

examples/alpaca.sh

@@ -7,4 +7,4 @@
cd `dirname $0`
cd ..

-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
+./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt --ctx_size 2048 -n -1 -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7

examples/benchmark/benchmark-q4_0-matmult.c

@@ -0,0 +1,270 @@
/*
    License: MIT License

    Changelog:
    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
*/

#include <locale.h>
#include "ggml.h"
#include <assert.h>
#include <math.h>
#include <cstring>
#include <cstdio>
#include <cinttypes>
#include <unordered_map>
#include <queue>
#include <string.h>
#include <cassert>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>

float tensor_sum_elements(struct ggml_tensor * tensor) {
    float sum = 0;
    if (tensor->type==6) {
        for (int j = 0; j < tensor->ne[1]; j++) {
            for (int k = 0; k < tensor->ne[0]; k++) {
                sum += ((float *) tensor->data)[j*tensor->ne[0]+k];
            }
        }
    }
    return sum;
}

/*
    These are mapping to unknown
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
    GGML_TYPE_COUNT,
*/

#define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"

#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
        TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
        TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
    { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }

struct benchmark_params_struct {
    int32_t n_threads    = 1;
    int32_t n_iterations = 10;
};

void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -i N, --iter N        number of iterations to use during computation (default: %d)\n", params.n_iterations);
    fprintf(stderr, "\n");
}

int main(int argc, char ** argv) {
    struct benchmark_params_struct benchmark_params;

    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

        if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            benchmark_params.n_threads = std::stoi(argv[i]);
        } else if (arg == "-i" || arg == "--iter") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            benchmark_params.n_iterations = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
            print_usage(argc, argv, benchmark_params);
            exit(0);
        }
        if (invalid_param) {
            fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
            print_usage(argc, argv, benchmark_params);
            exit(1);
        }
    }

    // create the ggml context
    printf("Starting Test\n");

    struct ggml_context * ctx;
    //const int sizex = 4096;
    //const int sizey = 11008;

#undef VERBOSE_DEBUGGING
#ifndef VERBOSE_DEBUGGING
    const int sizey = 4096;
    const int sizex = 11008;
    const int sizez = 128;
#else
    /* Working - let's increase size */
    const int sizey = 1;
    const int sizex = (8*32);
    const int sizez = 1;

    /*const int sizey = 1;
    const int sizex = 3*(8*32);
    const int sizez = 1;*/
#endif

    //printf("Memsize required = %i\n", sizex*sizex);
    ggml_type wtype = GGML_TYPE_F32;

    size_t ctx_size = 0;
    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
    ctx_size += sizex*sizeof(float);
    ctx_size += 1024*1024*100;

    printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));

    struct ggml_init_params params = {
        /*.mem_size   =*/ ctx_size,
        /*.mem_buffer =*/ NULL,
        /* no_alloc   =*/ 0
    };

    ctx = ggml_init(params);
    if (!ctx) {
        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
        return false;
    }

    printf("Creating new tensors\n");
    // printf("Creating new tensor m1\n");
    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
    ggml_set_f32(m11, 1.0f);

    // printf("Creating new tensor m1\n");
    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
    ggml_set_f32(m12, 1.5f);

    // printf("Creating new tensor m2\n");
    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
    ggml_set_f32(m2, 2.0f);

    printf("\n------ Test 1 - Matrix Mult via F32 code ------------------------------------------------------------------------------\n");
    // printf("Creating new tensor m11xm2\n");
    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);

    // printf("Creating compute graph\n");
    struct ggml_cgraph gf = ggml_build_forward(m11xm2);

    gf.n_threads=benchmark_params.n_threads;
    printf("cgraph->n_threads=%i\n",gf.n_threads);

    TENSOR_DUMP(m11);
    TENSOR_DUMP(m2);

    ggml_graph_compute(ctx, &gf);

    TENSOR_DUMP(gf.nodes[0]);

    printf("\n------ Test 2 - Matrix Mult via Q4_0 code ------------------------------------------------------------------------------\n");

    int32_t nelements = sizex*sizey;
    int32_t ne[2] = { sizex, sizey };

    std::vector<int64_t> hist_cur(1 << 4, 0);

    // Set up a the benchmark matrices
    // printf("Creating new tensor q11 & Running quantize\n");
    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
    ggml_quantize_q4_0((const float *) m11->data, q11->data, nelements, ne[0], hist_cur.data());

    // Set up a the compute graph
    // printf("Creating new tensor q31\n");
    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);

    // printf("Creating compute graph\n");
    struct ggml_cgraph gf31 = ggml_build_forward(q31);
    gf31.n_threads=benchmark_params.n_threads;

    // Set up a second graph computation to make sure we override the CPU cache lines
    // printf("Creating new tensor q12 & Running quantize\n");
    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, sizex, sizey);
    ggml_quantize_q4_0((const float *) m12->data, q12->data, nelements, ne[0], hist_cur.data());

    // printf("Creating new tensor q32\n");
    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);

    //printf("Creating compute graph\n");
    struct ggml_cgraph gf32 = ggml_build_forward(q32);
    gf32.n_threads=benchmark_params.n_threads;
    printf("cgraph->n_threads=%i\n",gf31.n_threads);

    const int dimx = sizex;
    const int dimy = sizey;
    const int dimz = sizez;
    long long int flops_per_dot_product = dimy + dimy;
    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);

    // Let's use the F32 result from above as a reference for the q4_0 multiplication
    float sum_of_F32_reference = tensor_sum_elements(gf.nodes[0]);

    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; FLOPS_per_u_Second\n");
    printf("==============================================================================================\n");

    for (int i=0;i<benchmark_params.n_iterations ;i++) {

        long long int start = ggml_time_us();
        //printf("Running ggml_graph_compute\n");
        ggml_graph_compute(ctx, &gf31);
        long long int stop = ggml_time_us();
        long long int usec = stop-start;
        float sec = usec/1000000;
        float flops_per_usec = (1.0f*flops_per_matrix)/usec;
        printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
            i,
            gf31.n_threads,
            sizex, sizey, sizez, flops_per_matrix,
            usec,flops_per_usec);

#ifdef VERBOSE_DEBUGGING
        TENSOR_DUMP("res",gf31.nodes[0])
#endif

        // Check that the matrix multiplication result is in the right ballpark
        // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
        float sum_of_Q4_result = tensor_sum_elements(gf31.nodes[0]);
        float delta = abs(sum_of_Q4_result - sum_of_F32_reference);
        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6

        if (delta > allowed_delta) {
            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
                sum_of_F32_reference,
                sum_of_Q4_result,
                delta,
                allowed_delta
            );
            exit(0);
        }

        // Running a different graph computation to make sure we override the CPU cache lines
        ggml_graph_compute(ctx, &gf32);
    }
}
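Once built, the binary can also be re-run directly; the `-t/--threads` and `-i/--iter` options parsed above control the run, so an illustrative invocation (flag values chosen arbitrarily) might be:

```bash
# 20 timed iterations of the Q4_0 mat-mul benchmark on 8 threads
./benchmark-q4_0-matmult --threads 8 --iter 20
```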

examples/common.cpp

@@ -7,12 +7,6 @@
#include <iterator>
#include <algorithm>

-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
-#include <alloca.h>
-#endif
-
#if defined (_WIN32)
#include <fcntl.h>
#include <io.h>

examples/gpt4all.sh

@@ -10,6 +10,6 @@ cd ..
./main --color --instruct --threads 4 \
  --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
  --file ./prompts/alpaca.txt \
-  --batch_size 8 --ctx_size 2048 \
+  --batch_size 8 --ctx_size 2048 -n -1 \
  --repeat_last_n 64 --repeat_penalty 1.3 \
  --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

examples/perplexity/perplexity.cpp

@@ -27,21 +27,28 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    int count = 0;
    int seq_count = tokens.size() / params.n_ctx;
+   int n_vocab = llama_n_vocab(ctx);

    double nll = 0.0;
-   fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+   fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);

    for (int i = 0; i < seq_count; ++i) {
        int start = i * params.n_ctx;
-       int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512
-                                           // it is better to always be power of 2 for better performance
+       int end = start + params.n_ctx;

-       std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+       std::vector<float> logits;
+       int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;

        auto start_t = std::chrono::high_resolution_clock::now();
-       if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
+       for (int j = 0; j < num_batches; ++j) {
+           int batch_start = start + j * params.n_batch;
+           int batch_size = std::min(end - batch_start, params.n_batch);
+           if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return;
        }
+           auto batch_logits = llama_get_logits(ctx);
+           logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+       }

        auto end_t = std::chrono::high_resolution_clock::now();
        if (i == 0) {
            const float seconds = std::chrono::duration<float>(end_t - start_t).count();

@@ -59,15 +66,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // Example, we have a context window of 512, we will compute perplexity for each of the
        // last 256 tokens.  Then, we split the input up into context window size chunks to
        // process the entire prompt.
-       auto logits = llama_get_logits(ctx);
-       for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+       for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
            // Calculate probability of next token, given the previous ones.
-           int n_vocab = llama_n_vocab(ctx);
            std::vector<float> tok_logits(
-               logits + j * n_vocab,
-               logits + (j + 1) * n_vocab);
-           const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+               logits.begin() + j * n_vocab,
+               logits.begin() + (j + 1) * n_vocab);
+           float prob = softmax(tok_logits)[tokens[start + j + 1]];
            nll += -std::log(prob);
            ++count;
        }

@@ -82,11 +86,13 @@ int main(int argc, char ** argv) {
    gpt_params params;

    params.model = "models/llama-7B/ggml-model.bin";
+   params.n_batch = 512;

    if (gpt_params_parse(argc, argv, params) == false) {
        return 1;
    }

    params.perplexity = true;
+   params.n_batch = std::min(params.n_batch, params.n_ctx);

    if (params.n_ctx > 2048) {
        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
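With the perplexity example now defaulting to `n_batch = 512` (clamped to `n_ctx`), each context window is evaluated in batch-sized chunks. An illustrative run, assuming a plain-text evaluation file and the `--batch_size` spelling used elsewhere in the examples, might be:

```bash
# hypothetical perplexity run with an explicit batch size (the evaluation file path is a placeholder)
./perplexity -m ./models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --batch_size 256
```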

examples/quantize-stats/quantize-stats.cpp

@@ -1,6 +1,7 @@
#include "ggml.h"
+#define LLAMA_API_INTERNAL
#include "llama.h"
-#include "llama_internal.h"

#include <algorithm>
#include <cassert>

flake.nix

@@ -28,10 +28,8 @@
      ];
      installPhase = ''
        mkdir -p $out/bin
-       mv bin/main $out/bin/llama
-       mv bin/quantize $out/bin/quantize
-       mv bin/embedding $out/bin/embedding
-       mv bin/perplexity $out/bin/perplexity
+       mv bin/* $out/bin/
+       mv $out/bin/main $out/bin/llama

        echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
        cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml

ggml.c

@@ -114,6 +114,14 @@ typedef void* thread_ret_t;
#define GGML_MEM_ALIGN 16
#endif

+#if defined(_MSC_VER) || defined(__MINGW32__)
+#define GGML_ALIGNED_MALLOC(size)  _aligned_malloc(size, GGML_MEM_ALIGN)
+#define GGML_ALIGNED_FREE(ptr)     _aligned_free(ptr)
+#else
+#define GGML_ALIGNED_MALLOC(size)  aligned_alloc(GGML_MEM_ALIGN, size)
+#define GGML_ALIGNED_FREE(ptr)     free(ptr)
+#endif
+
#define UNUSED(x) (void)(x)
#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
@@ -483,6 +491,77 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
}
#endif

+#if __ARM_NEON
+
+#if !defined(__aarch64__)
+
+inline static uint16_t vaddvq_u8(uint8x16_t v) {
+    return
+        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
+        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
+        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
+        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
+        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
+        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
+        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
+        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
+}
+
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static uint32_t vaddvq_u16(uint16x8_t v) {
+    return
+        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
+        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
+        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
+        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+
+inline static float vaddvq_f32(float32x4_t v) {
+    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
+}
+
+inline float vminvq_f32(float32x4_t v) {
+    return
+        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline float vmaxvq_f32(float32x4_t v) {
+    return
+        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
+            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
+}
+
+inline int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
+    return vget_low_s8(vcombine_s8(a, b));
+}
+
+inline int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
+    return vget_high_s8(vcombine_s8(a, b));
+}
+
+inline uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
+    return vget_low_u8(vcombine_u8(a, b));
+}
+
+inline uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
+    return vget_high_u8(vcombine_u8(a, b));
+}
+
+#endif
+#endif
+
// method 5
// blocks of QK elements
// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
@@ -1210,15 +1289,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
#define GGML_F32x4_ADD vaddq_f32
#define GGML_F32x4_MUL vmulq_f32
-#if defined(__ARM_FEATURE_QRDMX)
-    #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#else
-    #define GGML_F32x4_REDUCE_ONE(x) \
-    (vgetq_lane_f32(x, 0) + \
-     vgetq_lane_f32(x, 1) + \
-     vgetq_lane_f32(x, 2) + \
-     vgetq_lane_f32(x, 3))
-#endif
+#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
#define GGML_F32x4_REDUCE(res, x) \
{ \
    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
@@ -1841,55 +1912,43 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        // 4-bit -> 8-bit
        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));
        const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));
        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
        const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));

        const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));
        const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));
        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
        const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));

        // sub 8
        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);
        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);
        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);
        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);

        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
        const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);
        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
        const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);

#if defined(__ARM_FEATURE_DOTPROD)
-       // dot product into int16x8_t
+       // dot product into int32x4_t
        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);

        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);

-       // scalar
-#if defined(__ARM_FEATURE_QRDMX)
-       sum0 += x0->d * y0->d * vaddvq_s32(p_0);
-       sum1 += x1->d * y1->d * vaddvq_s32(p_1);
-#else
-       sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
-       sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
-#endif
+       sum0 += x0->d*y0->d*vaddvq_s32(p_0);
+       sum1 += x1->d*y1->d*vaddvq_s32(p_1);
#else
        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));

        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));

@@ -1902,14 +1961,8 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);

-       // scalar
-#if defined(__ARM_FEATURE_QRDMX)
-       sum0 += x0->d * y0->d * vaddvq_s16(p_0);
-       sum1 += x1->d * y1->d * vaddvq_s16(p_1);
-#else
-       sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
-       sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
-#endif
+       sum0 += x0->d*y0->d*vaddvq_s16(p_0);
+       sum1 += x1->d*y1->d*vaddvq_s16(p_1);
#endif
    }
@@ -2152,18 +2205,20 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
        const uint8_t * restrict p0 = x[i].qs;
        const uint8_t * restrict p1 = y[i].qs;

+       int sumi = 0;
        for (int j = 0; j < QK/2; j++) {
            const uint8_t v0 = p0[j];
            const uint8_t v1 = p1[j];

-           const float f0 = d0*((int8_t) (v0 & 0xf) - 8);
-           const float f1 = d0*((int8_t) (v0 >> 4) - 8);
-           const float f2 = d1*((int8_t) (v1 & 0xf) - 8);
-           const float f3 = d1*((int8_t) (v1 >> 4) - 8);
+           const int8_t i0 = (int8_t) (v0 & 0xf) - 8;
+           const int8_t i1 = (int8_t) (v0 >> 4) - 8;
+           const int8_t i2 = (int8_t) (v1 & 0xf) - 8;
+           const int8_t i3 = (int8_t) (v1 >> 4) - 8;

-           sumf += f0*f2 + f1*f3;
+           sumi += i0*i2 + i1*i3;
        }
+       sumf += d0 * d1 * sumi;
    }
#endif
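The rewritten scalar path accumulates the integer nibble products first and applies the two per-block scales once per block; the rearrangement relies on nothing more than factoring the constant scales out of the sum:

$$\sum_{j}\big(d_0\,i_{0,j}\big)\big(d_1\,i_{2,j}\big)+\big(d_0\,i_{1,j}\big)\big(d_1\,i_{3,j}\big)\;=\;d_0\,d_1\sum_{j}\big(i_{0,j}\,i_{2,j}+i_{1,j}\,i_{3,j}\big)$$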
@@ -2255,36 +2310,71 @@ static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * rest
    float sum10 = 0.0f;
    float sum11 = 0.0f;

-   for (int i = 0; i < nb; ++i) {
+   for (int i = 0; i < nb; i += 2) {
        const block_q4_1 * restrict x0 = &x[i + 0];
        const block_q4_1 * restrict y0 = &y[i + 0];
+       const block_q4_1 * restrict x1 = &x[i + 1];
+       const block_q4_1 * restrict y1 = &y[i + 1];

        const uint8x16_t m4b = vdupq_n_u8(0xf);

        const uint8x16_t v0_0 = vld1q_u8(x0->qs);
        const uint8x16_t v1_0 = vld1q_u8(y0->qs);
+       const uint8x16_t v0_1 = vld1q_u8(x1->qs);
+       const uint8x16_t v1_1 = vld1q_u8(y1->qs);

-       // and with 0xf
+       // 4-bit -> 8-bit
        const uint8x16_t v0_0l = vandq_u8(v0_0, m4b);
        const uint8x16_t v1_0l = vandq_u8(v1_0, m4b);
        const uint8x16_t v0_0h = vshrq_n_u8(v0_0, 4);
        const uint8x16_t v1_0h = vshrq_n_u8(v1_0, 4);

-       // dot product into uint16x8_t
-       const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
-       const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
-       const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
-       const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
-
-       const uint16x8_t pl0 = vaddq_u16(pl0l, pl0h);
-       const uint16x8_t ph0 = vaddq_u16(ph0l, ph0h);
+       const uint8x16_t v0_1l = vandq_u8(v0_1, m4b);
+       const uint8x16_t v1_1l = vandq_u8(v1_1, m4b);
+       const uint8x16_t v0_1h = vshrq_n_u8(v0_1, 4);
+       const uint8x16_t v1_1h = vshrq_n_u8(v1_1, 4);

        sum00 += x0->m*y0->m;
        sum01 += y0->m*x0->d*(vaddvq_u8(v0_0l) + vaddvq_u8(v0_0h));
        sum10 += x0->m*y0->d*(vaddvq_u8(v1_0l) + vaddvq_u8(v1_0h));
-       sum11 += x0->d*y0->d*vaddvq_u16(vaddq_u16(pl0, ph0));
+
+       sum00 += x1->m*y1->m;
+       sum01 += y1->m*x1->d*(vaddvq_u8(v0_1l) + vaddvq_u8(v0_1h));
+       sum10 += x1->m*y1->d*(vaddvq_u8(v1_1l) + vaddvq_u8(v1_1h));
+
+#if defined(__ARM_FEATURE_DOTPROD)
+       // dot product into int32x4_t
+       int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l);
+       int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l);
+
+       p_0 = vdotq_s32(p_0, v0_0h, v1_0h);
+       p_1 = vdotq_s32(p_1, v0_1h, v1_1h);
+
+       sum11 += x0->d*y0->d*vaddvq_s32(p_0);
+       sum11 += x1->d*y1->d*vaddvq_s32(p_1);
+#else
+       const uint16x8_t pl0l = vmull_u8(vget_low_u8 (v0_0l), vget_low_u8 (v1_0l));
+       const uint16x8_t pl0h = vmull_u8(vget_high_u8(v0_0l), vget_high_u8(v1_0l));
+       const uint16x8_t ph0l = vmull_u8(vget_low_u8 (v0_0h), vget_low_u8 (v1_0h));
+       const uint16x8_t ph0h = vmull_u8(vget_high_u8(v0_0h), vget_high_u8(v1_0h));
+
+       const uint16x8_t pl1l = vmull_u8(vget_low_u8 (v0_1l), vget_low_u8 (v1_1l));
+       const uint16x8_t pl1h = vmull_u8(vget_high_u8(v0_1l), vget_high_u8(v1_1l));
+       const uint16x8_t ph1l = vmull_u8(vget_low_u8 (v0_1h), vget_low_u8 (v1_1h));
+       const uint16x8_t ph1h = vmull_u8(vget_high_u8(v0_1h), vget_high_u8(v1_1h));
+
+       const uint16x8_t pl_0 = vaddq_u16(pl0l, pl0h);
+       const uint16x8_t ph_0 = vaddq_u16(ph0l, ph0h);
+
+       const uint16x8_t pl_1 = vaddq_u16(pl1l, pl1h);
+       const uint16x8_t ph_1 = vaddq_u16(ph1l, ph1h);
+
+       const uint16x8_t p_0 = vaddq_u16(pl_0, ph_0);
+       const uint16x8_t p_1 = vaddq_u16(pl_1, ph_1);
+
+       sum11 += x0->d*y0->d*vaddvq_u16(p_0);
+       sum11 += x1->d*y1->d*vaddvq_u16(p_1);
+#endif
    }

    sumf = QK*sum00 + sum01 + sum10 + sum11;
@@ -2966,7 +3056,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
    *ctx = (struct ggml_context) {
        /*.mem_size =*/ params.mem_size,
-       /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+       /*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : GGML_ALIGNED_MALLOC(params.mem_size),
        /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
        /*.no_alloc =*/ params.no_alloc,
        /*.n_objects =*/ 0,

@@ -3001,7 +3091,7 @@ void ggml_free(struct ggml_context * ctx) {
                __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);

            if (ctx->mem_buffer_owned) {
-               free(ctx->mem_buffer);
+               GGML_ALIGNED_FREE(ctx->mem_buffer);
            }

            found = true;

@@ -6435,7 +6525,7 @@ static void ggml_compute_forward_mul_mat_f32(
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne11, ne01, ne10,
                1.0f, y, ne10,
-               x, ne10,
+               x, ne00,
                0.0f, d, ne01);
    }
}

@@ -6607,7 +6697,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne11, ne01, ne10,
                1.0f, y, ne10,
-               x, ne10,
+               x, ne00,
                0.0f, d, ne01);
    }
}

@@ -6820,7 +6910,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
                ne11, ne01, ne10,
                1.0f, y, ne10,
-               x, ne10,
+               x, ne00,
                0.0f, d, ne01);
    }
}

@@ -9273,7 +9363,7 @@ struct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {
    struct ggml_cgraph result = {
        /*.n_nodes =*/ 0,
        /*.n_leafs =*/ 0,
-       /*.n_threads =*/ 0,
+       /*.n_threads =*/ GGML_DEFAULT_N_THREADS,
        /*.work_size =*/ 0,
        /*.work =*/ NULL,
        /*.nodes =*/ { NULL },

@@ -9894,7 +9984,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
    GGML_PRINT("=== GRAPH ===\n");
    GGML_PRINT_DEBUG("n_threads = %d\n", cgraph->n_threads);
-   GGML_PRINT_DEBUG("total work size = %zu bytes\n",cgraph->work_size);
+   GGML_PRINT_DEBUG("total work size = %zu bytes\n", cgraph->work_size);
    GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes);
    for (int i = 0; i < cgraph->n_nodes; i++) {

ggml.h

@@ -182,6 +182,7 @@ extern "C" {
#define GGML_MAX_PARAMS        16
#define GGML_MAX_CONTEXTS      64
#define GGML_MAX_OPT           4
+#define GGML_DEFAULT_N_THREADS 4

#ifdef __ARM_NEON
// we use the built-in 16-bit float type

llama.cpp

@@ -5,7 +5,6 @@
#include "llama_util.h"
#include "llama.h"
-#include "llama_internal.h"

#include "ggml.h"

@@ -827,7 +826,9 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_F16:  return "mostly F16";
        case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
        case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
-       default: LLAMA_ASSERT(false);
+       case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
+                                     return "mostly Q4_1, some F16";
+       default: return "unknown, may not work";
    }
}

llama.h

@@ -71,6 +71,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_F16  = 1, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+       LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
    };

    LLAMA_API struct llama_context_params llama_context_default_params();

@@ -178,4 +179,15 @@ extern "C" {
}
#endif

+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif
+
#endif // LLAMA_H

llama_internal.h

@@ -1,12 +0,0 @@
-// Internal header to be included by llama.cpp and tests/benchmarks only.
-#ifndef LLAMA_INTERNAL_H
-#define LLAMA_INTERNAL_H
-
-#include <vector>
-#include <string>
-
-struct ggml_tensor;
-
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
-#endif // LLAMA_INTERNAL_H