Merge branch 'ggerganov:master' into master

commit 098eb922b8
R.Kaufmann, 2023-03-25 21:23:56 +01:00 (committed via GitHub)
28 changed files with 790 additions and 973 deletions

.gitignore

@@ -19,6 +19,7 @@ models/*
 /main
 /quantize
 /result
+/perplexity
 
 arm_neon.h
 compile_commands.json

CMakeLists.txt

@@ -211,17 +211,6 @@ endif()
 # Build libraries
 #
 
-add_library(utils OBJECT
-            utils.cpp
-            utils.h)
-
-target_include_directories(utils PUBLIC .)
-target_compile_features(utils PUBLIC cxx_std_11) # don't bump
-target_link_libraries(utils PRIVATE ${LLAMA_EXTRA_LIBS})
-
-if (BUILD_SHARED_LIBS)
-    set_target_properties(utils PROPERTIES POSITION_INDEPENDENT_CODE ON)
-endif()
-
 add_library(ggml OBJECT
             ggml.c
             ggml.h)
@@ -239,22 +228,12 @@ add_library(llama
 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump
-target_link_libraries(llama PRIVATE utils ggml ${LLAMA_EXTRA_LIBS})
+target_link_libraries(llama PRIVATE ggml ${LLAMA_EXTRA_LIBS})
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
     target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
 endif()
 
-#
-# Executables
-#
-
-add_executable(main main.cpp)
-target_link_libraries(main PRIVATE llama ggml utils)
-
-add_executable(quantize quantize.cpp)
-target_link_libraries(quantize PRIVATE llama ggml utils)
-
 #
 # programs, examples and tests
 #
@@ -264,6 +243,6 @@ if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     add_subdirectory(tests)
 endif ()
 
-#if (LLAMA_BUILD_EXAMPLES)
-#    add_subdirectory(examples)
-#endif()
+if (LLAMA_BUILD_EXAMPLES)
+    add_subdirectory(examples)
+endif()

Makefile

@@ -212,7 +212,7 @@ $(info I CC:  $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
-default: main quantize
+default: main quantize perplexity
 
 #
 # Build library
@@ -224,20 +224,23 @@ ggml.o: ggml.c ggml.h
 llama.o: llama.cpp llama.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
 
-utils.o: utils.cpp utils.h
-	$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
+common.o: examples/common.cpp examples/common.h
+	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
-	rm -f *.o main quantize
+	rm -vf *.o main quantize perplexity
 
-main: main.cpp ggml.o llama.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 	@echo
 
-quantize: quantize.cpp ggml.o llama.o utils.o
-	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o
+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
 
 #
 # Tests

README.md

@@ -179,7 +179,10 @@ Here is an example few-shot interaction, invoked with the command
 ```bash
 # default arguments using 7B model
-./chat.sh
+./examples/chat.sh
+
+# advanced chat with 13B model
+./examples/chat-13B.sh
 
 # custom arguments using 13B model
 ./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
@@ -195,7 +198,7 @@ Note the use of `--color` to distinguish between user input and generated text.
 2. Run the `main` tool like this:
 
 ```
-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins
+./examples/alpaca.sh
 ```
 
 Sample run:

chat.sh (deleted)

@@ -1,6 +0,0 @@
-#!/bin/bash
-#
-#  Temporary script - will be removed in the future
-#
-
-./main -m ./models/7B/ggml-model-q4_0.bin -b 128 -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt

examples/CMakeLists.txt (new file)

@@ -0,0 +1,36 @@
+# dependencies
+
+find_package(Threads REQUIRED)
+
+# third-party
+
+# ...
+
+# common
+
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+    common.h
+    common.cpp
+    )
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)
+
+# examples
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+if (EMSCRIPTEN)
+else()
+    add_subdirectory(main)
+    add_subdirectory(quantize)
+    add_subdirectory(perplexity)
+    add_subdirectory(embedding)
+endif()

examples/alpaca.sh

@@ -1,6 +1,10 @@
 #!/bin/bash
 
 #
 # Temporary script - will be removed in the future
 #
+
+cd `dirname $0`
+cd ..
+
 ./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins -b 256 --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7

examples/chat.sh (new file)

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#
+# Temporary script - will be removed in the future
+#
+
+cd `dirname $0`
+cd ..
+
+# Important:
+#
+#   "--keep 48" is based on the contents of prompts/chat-with-bob.txt
+#
+./main -m ./models/7B/ggml-model-q4_0.bin -c 512 -b 1024 -n 256 --keep 48 \
+    --repeat_penalty 1.0 --color -i \
+    -r "User:" -f prompts/chat-with-bob.txt

examples/common.cpp (renamed from utils.cpp)

@@ -1,6 +1,6 @@
-#include "ggml.h"
-#include "utils.h"
+#include "common.h"
+#include "ggml.h"
 
 #include <cassert>
 #include <cstring>
@@ -112,6 +112,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             }
             params.n_batch = std::stoi(argv[i]);
             params.n_batch = std::min(512, params.n_batch);
+        } else if (arg == "--keep") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_keep = std::stoi(argv[i]);
         } else if (arg == "-m" || arg == "--model") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -134,7 +140,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_mlock = true;
         } else if (arg == "--mtest") {
            params.mem_test = true;
-        } else if (arg == "--verbose_prompt") {
+        } else if (arg == "--verbose-prompt") {
            params.verbose_prompt = true;
        } else if (arg == "-r" || arg == "--reverse-prompt") {
             if (++i >= argc) {
@@ -198,7 +204,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
     fprintf(stderr, "  -f FNAME, --file FNAME\n");
     fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 - infinity)\n", params.n_predict);
     fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
     fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
     fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
@@ -210,6 +216,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
+    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt\n");
     if (ggml_mlock_supported()) {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }

examples/common.h (renamed from utils.h)

@@ -21,6 +21,7 @@ struct gpt_params {
     int32_t n_parts = -1;  // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx   = 512; // context size
     int32_t n_batch = 8;   // batch size for prompt processing
+    int32_t n_keep  = 0;   // number of tokens to keep from initial prompt
 
     // sampling parameters
     int32_t top_k = 40;

examples/embedding/CMakeLists.txt (new file)

@@ -0,0 +1,4 @@
+set(TARGET embedding)
+add_executable(${TARGET} embedding.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/embedding/README.md (new file)

@@ -0,0 +1,3 @@
+# embedding
+
+TODO

examples/embedding/embedding.cpp (new file)

@@ -0,0 +1,101 @@
+#include "common.h"
+#include "llama.h"
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    params.embedding = true;
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_context * ctx;
+
+    // load the model
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx      = params.n_ctx;
+        lparams.n_parts    = params.n_parts;
+        lparams.seed       = params.seed;
+        lparams.f16_kv     = params.memory_f16;
+        lparams.logits_all = params.perplexity;
+        lparams.use_mlock  = params.use_mlock;
+        lparams.embedding  = params.embedding;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
+    int n_past = 0;
+
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
+
+    // tokenize the prompt
+    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);
+
+    // determine newline token
+    auto llama_token_newline = ::llama_tokenize(ctx, "\n", false);
+
+    if (params.verbose_prompt) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+        fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+        for (int i = 0; i < (int) embd_inp.size(); i++) {
+            fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
+        }
+        fprintf(stderr, "\n");
+    }
+
+    if (params.embedding){
+        if (embd_inp.size() > 0) {
+            if (llama_eval(ctx, embd_inp.data(), embd_inp.size(), n_past, params.n_threads)) {
+                fprintf(stderr, "%s : failed to eval\n", __func__);
+                return 1;
+            }
+        }
+
+        const int n_embd = llama_n_embd(ctx);
+        const auto embeddings = llama_get_embeddings(ctx);
+
+        for (int i = 0; i < n_embd; i++) {
+            printf("%f ", embeddings[i]);
+        }
+        printf("\n");
+    }
+
+    llama_print_timings(ctx);
+    llama_free(ctx);
+
+    return 0;
+}

examples/main/CMakeLists.txt (new file)

@@ -0,0 +1,4 @@
+set(TARGET main)
+add_executable(${TARGET} main.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/main/README.md (new file)

@@ -0,0 +1,3 @@
+# main
+
+TODO

examples/main/main.cpp (moved from main.cpp)

@@ -1,5 +1,4 @@
-#include "utils.h"
-#include "ggml.h"
+#include "common.h"
 #include "llama.h"
 
 #include <cassert>
@@ -45,8 +44,18 @@ enum console_state {
 static console_state con_st = CONSOLE_STATE_DEFAULT;
 static bool con_use_color = false;
 
-void set_console_state(console_state new_st)
-{
+void enable_console_colors() {
+#if defined (_WIN32)
+    // Enable ANSI colors on Windows 10+
+    unsigned long dwMode = 0;
+    void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
+    if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
+        SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
+    }
+#endif
+}
+
+void set_console_state(console_state new_st) {
     if (!con_use_color) return;
     // only emit color code if state changed
     if (new_st != con_st) {
@@ -65,79 +74,6 @@ void set_console_state(console_state new_st)
     }
 }
 
-std::vector<double> softmax(const std::vector<float>& logits) {
-    std::vector<double> probs(logits.size());
-    float max_logit = logits[0];
-    for (float v : logits) max_logit = std::max(max_logit, v);
-    double sum_exp = 0.0;
-    for (size_t i = 0; i < logits.size(); i++) {
-        // Subtract the maximum logit value from the current logit value for numerical stability
-        float logit = logits[i] - max_logit;
-        double exp_logit = std::exp(logit);
-        sum_exp += exp_logit;
-        probs[i] = exp_logit;
-    }
-    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
-    return probs;
-}
-
-void perplexity(llama_context * ctx, const gpt_params & params) {
-    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-    // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
-    // Output: `perplexity: 13.5106 [114/114]`
-    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
-
-    int count = 0;
-    double nll = 0.0;
-    int seq_count = tokens.size() / params.n_ctx;
-
-    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
-
-    for (int i = 0; i < seq_count; ++i) {
-        int start = i * params.n_ctx;
-        int end = start + params.n_ctx - 1;
-        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
-        auto start_t = std::chrono::high_resolution_clock::now();
-        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
-            fprintf(stderr, "%s : failed to eval\n", __func__);
-            return;
-        }
-        auto end_t = std::chrono::high_resolution_clock::now();
-        if (i == 0) {
-            double seconds = std::chrono::duration<double>(end_t - start_t).count();
-            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
-        }
-        // We get the logits for all the tokens in the context window (params.n_ctx)
-        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
-        // calculate the perplexity over the last half the window (so the model always has
-        // some context to predict the token).
-        //
-        // We rely on the fact that attention in the forward pass only looks at previous
-        // tokens here, so the logits returned for each token are an accurate representation
-        // of what the model would have predicted at that point.
-        //
-        // Example, we have a context window of 512, we will compute perplexity for each of the
-        // last 256 tokens.  Then, we split the input up into context window size chunks to
-        // process the entire prompt.
-        auto logits = llama_get_logits(ctx);
-        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
-            // Calculate probability of next token, given the previous ones.
-            int n_vocab = llama_n_vocab(ctx);
-            std::vector<float> tok_logits(
-                logits + j * n_vocab,
-                logits + (j + 1) * n_vocab);
-            double prob = softmax(tok_logits)[tokens[start + j + 1]];
-            nll += -std::log(prob);
-            ++count;
-        }
-        // perplexity is e^(average negative log-likelihood)
-        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
-        fflush(stdout);
-    }
-    printf("\n");
-}
-
 static bool is_interacting = false;
 
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
@@ -155,9 +91,6 @@ void sigint_handler(int signo) {
 #endif
 
 int main(int argc, char ** argv) {
-    // has to be called once at the start of the program to init ggml stuff
-    ggml_time_init();
-
     gpt_params params;
     params.model = "models/llama-7B/ggml-model.bin";
 
@@ -165,6 +98,22 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    if (params.perplexity) {
+        printf("\n************\n");
+        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
+
+    if (params.embedding) {
+        printf("\n************\n");
+        printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__);
+        printf("************\n\n");
+
+        return 0;
+    }
+
     if (params.n_ctx > 2048) {
         fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
                 "expect poor results\n", __func__, params.n_ctx);
@@ -198,9 +147,7 @@ int main(int argc, char ** argv) {
         lparams.n_parts = params.n_parts;
         lparams.seed    = params.seed;
         lparams.f16_kv  = params.memory_f16;
-        lparams.logits_all = params.perplexity;
         lparams.use_mlock = params.use_mlock;
-        lparams.embedding = params.embedding;
 
         ctx = llama_init_from_file(params.model.c_str(), lparams);
 
@@ -236,13 +183,6 @@ int main(int argc, char ** argv) {
         return 0;
     }
 
-    if (params.perplexity) {
-        perplexity(ctx, params);
-        exit(0);
-    }
-
-    int n_past = 0;
-
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
@@ -251,7 +191,12 @@ int main(int argc, char ** argv) {
 
     const int n_ctx = llama_n_ctx(ctx);
 
-    params.n_predict = std::min(params.n_predict, n_ctx - (int) embd_inp.size());
+    if ((int) embd_inp.size() > n_ctx - 4) {
+        fprintf(stderr, "%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
+        return 1;
+    }
+
+    params.n_keep = std::min(params.n_keep, (int) embd_inp.size());
 
     // prefix & suffix for instruct mode
     const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
@@ -282,6 +227,13 @@ int main(int argc, char ** argv) {
         for (int i = 0; i < (int) embd_inp.size(); i++) {
             fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], llama_token_to_str(ctx, embd_inp[i]));
         }
+        if (params.n_keep > 0) {
+            fprintf(stderr, "%s: static prompt based on n_keep: '", __func__);
+            for (int i = 0; i < params.n_keep; i++) {
+                fprintf(stderr, "%s", llama_token_to_str(ctx, embd_inp[i]));
+            }
+            fprintf(stderr, "'\n");
+        }
         fprintf(stderr, "\n");
     }
@@ -308,14 +260,12 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
         }
     }
-    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     fprintf(stderr, "\n\n");
 
-    std::vector<llama_token> embd;
-
-    int last_n_size = params.repeat_last_n;
-    std::vector<llama_token> last_n_tokens(last_n_size);
+    // TODO: replace with ring-buffer
+    std::vector<llama_token> last_n_tokens(n_ctx);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
     if (params.interactive) {
@@ -328,48 +278,44 @@ int main(int argc, char ** argv) {
         is_interacting = params.interactive_start || params.instruct;
     }
 
-    int input_consumed = 0;
     bool input_noecho  = false;
-    int remaining_tokens = params.n_predict;
 
-#if defined (_WIN32)
-    if (params.use_color) {
-        // Enable ANSI colors on Windows 10+
-        unsigned long dwMode = 0;
-        void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
-        if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
-            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
-        }
-    }
-#endif
+    int n_past     = 0;
+    int n_remain   = params.n_predict;
+    int n_consumed = 0;
 
     // the first thing we will do is to output the prompt, so set color accordingly
+    if (params.use_color) {
+        enable_console_colors();
+    }
     set_console_state(CONSOLE_STATE_PROMPT);
 
-    if (params.embedding){
-        embd = embd_inp;
-
-        if (embd.size() > 0) {
-            if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
-                fprintf(stderr, "%s : failed to eval\n", __func__);
-                return 1;
-            }
-        }
-
-        const auto embeddings = llama_get_embeddings(ctx);
-
-        // TODO: print / use the embeddings
-
-        if (params.use_color) {
-            printf(ANSI_COLOR_RESET);
-        }
-
-        return 0;
-    }
+    std::vector<llama_token> embd;
 
-    while (remaining_tokens > 0 || params.interactive) {
+    while (n_remain != 0 || params.interactive) {
         // predict
         if (embd.size() > 0) {
+            // infinite text generation via context swapping
+            // if we run out of context:
+            // - take the n_keep first tokens from the original prompt (via n_past)
+            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch
+            if (n_past + (int) embd.size() > n_ctx) {
+                const int n_left = n_past - params.n_keep;
+
+                n_past = params.n_keep;
+
+                // insert n_left/2 tokens at the start of embd from last_n_tokens
+                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
+
+                //printf("\n---\n");
+                //printf("resetting: '");
+                //for (int i = 0; i < (int) embd.size(); i++) {
+                //    printf("%s", llama_token_to_str(ctx, embd[i]));
+                //}
+                //printf("'\n");
+                //printf("\n---\n");
+            }
+
             if (llama_eval(ctx, embd.data(), embd.size(), n_past, params.n_threads)) {
                 fprintf(stderr, "%s : failed to eval\n", __func__);
                 return 1;
@@ -379,7 +325,7 @@ int main(int argc, char ** argv) {
         n_past += embd.size();
         embd.clear();
 
-        if ((int) embd_inp.size() <= input_consumed && !is_interacting) {
+        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
             // out of user input, sample next token
             const float top_k = params.top_k;
             const float top_p = params.top_p;
@@ -392,14 +338,12 @@ int main(int argc, char ** argv) {
                 auto logits = llama_get_logits(ctx);
 
                 if (params.ignore_eos) {
-                    // set the logit of the eos token to zero to avoid sampling it
-                    //logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
-                    // TODO: this does not work of params.logits_all == true
-                    assert(params.perplexity == false);
                     logits[llama_token_eos()] = 0;
                 }
 
-                id = llama_sample_top_p_top_k(ctx, last_n_tokens.data(), last_n_tokens.size(), top_k, top_p, temp, repeat_penalty);
+                id = llama_sample_top_p_top_k(ctx,
+                        last_n_tokens.data() + n_ctx - params.repeat_last_n,
+                        params.repeat_last_n, top_k, top_p, temp, repeat_penalty);
 
                 last_n_tokens.erase(last_n_tokens.begin());
                 last_n_tokens.push_back(id);
@@ -422,14 +366,14 @@ int main(int argc, char ** argv) {
             input_noecho = false;
 
             // decrement remaining sampling budget
-            --remaining_tokens;
+            --n_remain;
         } else {
             // some user input remains from prompt or interaction, forward it to processing
-            while ((int) embd_inp.size() > input_consumed) {
-                embd.push_back(embd_inp[input_consumed]);
+            while ((int) embd_inp.size() > n_consumed) {
+                embd.push_back(embd_inp[n_consumed]);
                 last_n_tokens.erase(last_n_tokens.begin());
-                last_n_tokens.push_back(embd_inp[input_consumed]);
-                ++input_consumed;
+                last_n_tokens.push_back(embd_inp[n_consumed]);
+                ++n_consumed;
                 if ((int) embd.size() >= params.n_batch) {
                     break;
                 }
@@ -444,13 +388,13 @@ int main(int argc, char ** argv) {
             fflush(stdout);
         }
 
         // reset color to default if we there is no pending user input
-        if (!input_noecho && (int)embd_inp.size() == input_consumed) {
+        if (!input_noecho && (int)embd_inp.size() == n_consumed) {
             set_console_state(CONSOLE_STATE_DEFAULT);
         }
 
         // in interactive mode, and not currently processing queued inputs;
         // check if we should prompt the user for more
-        if (params.interactive && (int) embd_inp.size() <= input_consumed) {
+        if (params.interactive && (int) embd_inp.size() <= n_consumed) {
             // check for reverse prompt
             std::string last_output;
             for (auto id : last_n_tokens) {
@@ -472,7 +416,7 @@ int main(int argc, char ** argv) {
                 set_console_state(CONSOLE_STATE_USER_INPUT);
 
                 if (params.instruct) {
-                    input_consumed = embd_inp.size();
+                    n_consumed = embd_inp.size();
                     embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
 
                     printf("\n> ");
@@ -506,7 +450,7 @@ int main(int argc, char ** argv) {
                     embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
                 }
 
-                remaining_tokens -= line_inp.size();
+                n_remain -= line_inp.size();
 
                 input_noecho = true; // do not echo this again
             }
@@ -527,8 +471,8 @@ int main(int argc, char ** argv) {
         }
 
         // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
-        if (params.interactive && remaining_tokens <= 0) {
-            remaining_tokens = params.n_predict;
+        if (params.interactive && n_remain <= 0) {
+            n_remain = params.n_predict;
             is_interacting = true;
         }
     }
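The context-swapping arithmetic in the generation loop above is easy to misread in diff form. Below is a compact sketch of just the index bookkeeping, under the same meanings of n_ctx, n_past, and n_keep; the function name and the plain int vectors are stand-ins for the real token buffers, not part of the code in this commit:

```cpp
#include <vector>

// Index bookkeeping for the "infinite generation" reset in the loop above:
// keep the first n_keep prompt tokens, re-feed half of the remaining history,
// and let the next eval recompute logits for the re-fed tokens in one batch.
void context_swap_sketch(std::vector<int> & embd,         // tokens queued for eval
                         const std::vector<int> & last_n, // last n_ctx tokens seen
                         int & n_past, int n_ctx, int n_keep) {
    if (n_past + (int) embd.size() <= n_ctx) {
        return; // still fits in the context window
    }

    const int n_left = n_past - n_keep; // history beyond the kept prefix

    n_past = n_keep; // logically rewind to just after the kept prefix

    // prepend the most recent n_left/2 history tokens; this mirrors the
    // embd.insert(...) index arithmetic in main.cpp above
    embd.insert(embd.begin(),
                last_n.begin() + n_ctx - n_left/2 - (int) embd.size(),
                last_n.end() - (int) embd.size());
}
```

After the swap, the next llama_eval recomputes logits for the re-fed n_left/2 tokens in a single batch, so generation continues indefinitely without ever reprocessing a full window.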

examples/perplexity/CMakeLists.txt (new file)

@@ -0,0 +1,4 @@
+set(TARGET perplexity)
+add_executable(${TARGET} perplexity.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/perplexity/README.md (new file)

@@ -0,0 +1,3 @@
+# perplexity
+
+TODO

examples/perplexity/perplexity.cpp (new file)

@@ -0,0 +1,138 @@
+#include "common.h"
+#include "llama.h"
+
+std::vector<double> softmax(const std::vector<float>& logits) {
+    std::vector<double> probs(logits.size());
+    float max_logit = logits[0];
+    for (float v : logits) max_logit = std::max(max_logit, v);
+    double sum_exp = 0.0;
+    for (size_t i = 0; i < logits.size(); i++) {
+        // Subtract the maximum logit value from the current logit value for numerical stability
+        float logit = logits[i] - max_logit;
+        double exp_logit = std::exp(logit);
+        sum_exp += exp_logit;
+        probs[i] = exp_logit;
+    }
+    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
+    return probs;
+}
+
+void perplexity(llama_context * ctx, const gpt_params & params) {
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    auto tokens = ::llama_tokenize(ctx, params.prompt, true);
+
+    int count = 0;
+    double nll = 0.0;
+    int seq_count = tokens.size() / params.n_ctx;
+
+    fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count);
+
+    for (int i = 0; i < seq_count; ++i) {
+        int start = i * params.n_ctx;
+        int end = start + params.n_ctx - 1;
+        std::vector<llama_token> embd(tokens.begin() + start, tokens.begin() + end);
+        auto start_t = std::chrono::high_resolution_clock::now();
+        if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) {
+            fprintf(stderr, "%s : failed to eval\n", __func__);
+            return;
+        }
+        auto end_t = std::chrono::high_resolution_clock::now();
+        if (i == 0) {
+            double seconds = std::chrono::duration<double>(end_t - start_t).count();
+            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
+        }
+        // We get the logits for all the tokens in the context window (params.n_ctx)
+        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
+        // calculate the perplexity over the last half the window (so the model always has
+        // some context to predict the token).
+        //
+        // We rely on the fact that attention in the forward pass only looks at previous
+        // tokens here, so the logits returned for each token are an accurate representation
+        // of what the model would have predicted at that point.
+        //
+        // Example, we have a context window of 512, we will compute perplexity for each of the
+        // last 256 tokens.  Then, we split the input up into context window size chunks to
+        // process the entire prompt.
+        auto logits = llama_get_logits(ctx);
+        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+            // Calculate probability of next token, given the previous ones.
+            int n_vocab = llama_n_vocab(ctx);
+            std::vector<float> tok_logits(
+                logits + j * n_vocab,
+                logits + (j + 1) * n_vocab);
+            double prob = softmax(tok_logits)[tokens[start + j + 1]];
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        fflush(stdout);
+    }
+    printf("\n");
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+    params.model = "models/llama-7B/ggml-model.bin";
+
+    if (gpt_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    params.perplexity = true;
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+
+    if (params.seed <= 0) {
+        params.seed = time(NULL);
+    }
+
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
+
+    std::mt19937 rng(params.seed);
+    if (params.random_prompt) {
+        params.prompt = gpt_random_prompt(rng);
+    }
+
+    llama_context * ctx;
+
+    // load the model
+    {
+        auto lparams = llama_context_default_params();
+
+        lparams.n_ctx      = params.n_ctx;
+        lparams.n_parts    = params.n_parts;
+        lparams.seed       = params.seed;
+        lparams.f16_kv     = params.memory_f16;
+        lparams.logits_all = params.perplexity;
+        lparams.use_mlock  = params.use_mlock;
+        lparams.embedding  = params.embedding;
+
+        ctx = llama_init_from_file(params.model.c_str(), lparams);
+
+        if (ctx == NULL) {
+            fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+            return 1;
+        }
+    }
+
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
+    perplexity(ctx, params);
+
+    llama_print_timings(ctx);
+    llama_free(ctx);
+
+    return 0;
+}
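In formula form, what the loop above accumulates: token probabilities come from the max-shifted softmax (subtracting the maximum logit changes nothing mathematically but avoids overflow in `exp`), and the number printed after each chunk is the running perplexity. The notation below is mine, not from the source:

```latex
p(t_{j+1} \mid t_{\le j}) = \frac{e^{\,z_{t_{j+1}} - z_{\max}}}{\sum_{v=1}^{n_{\mathrm{vocab}}} e^{\,z_v - z_{\max}}},
\qquad
\mathrm{PPL} = \exp\!\left(-\frac{1}{N}\sum_{j=1}^{N} \log p(t_{j+1} \mid t_{\le j})\right)
```

Scoring only the last n_ctx/2 positions of each window keeps at least half a window of context behind every scored token, which is the strided-evaluation recommendation from the Hugging Face perplexity guide cited in the comment.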

examples/quantize/CMakeLists.txt (new file)

@@ -0,0 +1,4 @@
+set(TARGET quantize)
+add_executable(${TARGET} quantize.cpp)
+target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/quantize/README.md (new file)

@@ -0,0 +1,3 @@
+# quantize
+
+TODO

ggml.c

@@ -496,7 +496,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) {
 void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     assert(k % QK == 0);
 
-#if __ARM_NEON || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
+#if defined(__ARM_NEON) || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__)
     const int nb = k / QK;
     const size_t bs = sizeof(float) + QK/2;
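For orientation before the per-architecture paths below: a Q4_0 block is one fp32 scale followed by QK/2 bytes of packed 4-bit quants, which is exactly the `bs = sizeof(float) + QK/2` stride above. A minimal scalar sketch of the layout and the quantization step, assuming QK == 32 (the only block size these SIMD paths support after this commit); the struct and function names are illustrative, not part of ggml:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

constexpr int QK = 32; // block size; the only value the SIMD paths support

// One Q4_0 block: an fp32 scale, then QK/2 bytes holding two 4-bit quants each.
struct q4_0_block_sketch {
    float   d;
    uint8_t qs[QK / 2];
};

// Quantize QK floats: amax maps to quant level 7, values are rounded into
// [-8, 7] and stored biased by +8 so each nibble lands in [0, 15].
void quantize_block_sketch(const float * x, q4_0_block_sketch * out) {
    float amax = 0.0f;
    for (int l = 0; l < QK; l++) {
        amax = std::max(amax, std::fabs(x[l]));
    }

    const float d  = amax / ((1 << 3) - 1);
    const float id = d ? 1.0f / d : 0.0f;

    out->d = d;
    for (int l = 0; l < QK; l += 2) {
        const uint8_t v0 = (uint8_t)((int8_t)std::round(x[l + 0] * id) + 8);
        const uint8_t v1 = (uint8_t)((int8_t)std::round(x[l + 1] * id) + 8);
        out->qs[l / 2] = v0 | (v1 << 4); // even index -> low nibble, odd -> high
    }
}
```

The `v85 = vec_splats(8.5f)` constant in the POWER9 path below appears to fold this round-plus-bias into a single add before truncation.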
@@ -507,7 +507,6 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
 #endif
 
 #if defined(__POWER9_VECTOR__)
-#if QK == 32
     const vector float v85 = vec_splats(8.5f);
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
@@ -548,11 +547,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
         //memcpy(pb, pp, sizeof(pp));
         pb += bs;
     }
-#else
-#error "not implemented for QK"
-#endif
 #elif __ARM_NEON
-#if QK == 32
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
@@ -589,11 +584,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
         memcpy(pb, pp, sizeof(pp));
         pb += bs;
     }
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__AVX2__)
-#if QK == 32
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -660,11 +651,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
         _mm_storeu_si128( ( __m128i* )pb, res );
         pb += bs;
     }
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__wasm_simd128__)
-#if QK == 32
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
@@ -701,9 +688,6 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
         memcpy(pb, pp, sizeof(pp));
         pb += bs;
     }
-#else
-#error "not implemented for QK"
-#endif
 #else
     // scalar
     quantize_row_q4_0_reference(x, y, k);
@@ -771,7 +755,7 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
     const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
     const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
 
-#if defined(__AVX2__) && QK % 32 == 0
+#if defined(__AVX2__)
     for (int i = 0; i < nb; i++) {
         // scale factor
         const __m256 d_v = _mm256_broadcast_ss((const float *) (pd + i*bs));
@@ -799,11 +783,64 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
             // Scale and store
             for (int j = 0; j < 4; j++) {
-                __m256 result = _mm256_mul_ps(vf[j], d_v);
+                const __m256 result = _mm256_mul_ps(vf[j], d_v);
                 _mm256_storeu_ps(y + i * QK + l + j*8, result);
             }
         }
     }
+#elif defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        const float d = *(const float *) (pd + i*bs);
+
+        const uint8_t * restrict pp = pb + i*bs;
+
+        const float32x4_t vd = vdupq_n_f32(d);
+
+        for (int l = 0; l < QK; l += 16) {
+            // Load 16x4-bit integers into 8x8-bit integers
+            const uint8x8_t v8 = vld1_u8(pp + l/2);
+
+            // Expand 4-bit nibbles to 8-bit bytes
+            const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f));
+            const uint8x8_t v1 = vshr_n_u8(v8, 4);
+
+            // Convert to signed 8-bit integers
+            const int8x8_t vs_0 = vreinterpret_s8_u8(v0);
+            const int8x8_t vs_1 = vreinterpret_s8_u8(v1);
+
+            // Subtract 8 from each byte
+            const int8x8_t vb_0 = vsub_s8(vs_0, vdup_n_s8(8));
+            const int8x8_t vb_1 = vsub_s8(vs_1, vdup_n_s8(8));
+
+            // Interleave and combine
+            const int8x8_t vx_0 = vzip1_s8(vb_0, vb_1);
+            const int8x8_t vx_1 = vzip2_s8(vb_0, vb_1);
+
+            const int8x16_t vq = vcombine_s8(vx_0, vx_1);
+
+            // convert to 2x int16x8_t
+            const int16x8_t vi_0 = vmovl_s8(vget_low_s8 (vq));
+            const int16x8_t vi_1 = vmovl_s8(vget_high_s8(vq));
+
+            // convert to 4x float32x4_t
+            const float32x4_t vf_0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_0)));
+            const float32x4_t vf_1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_0)));
+            const float32x4_t vf_2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vi_1)));
+            const float32x4_t vf_3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vi_1)));
+
+            // Multiply by d
+            const float32x4_t r0 = vmulq_f32(vf_0, vd);
+            const float32x4_t r1 = vmulq_f32(vf_1, vd);
+            const float32x4_t r2 = vmulq_f32(vf_2, vd);
+            const float32x4_t r3 = vmulq_f32(vf_3, vd);
+
+            // Store
+            vst1q_f32(y + i*QK + l +  0, r0);
+            vst1q_f32(y + i*QK + l +  4, r1);
+            vst1q_f32(y + i*QK + l +  8, r2);
+            vst1q_f32(y + i*QK + l + 12, r3);
+        }
+    }
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
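The scalar `#else` branch that continues in the source makes the nibble order explicit; the NEON path above vectorizes exactly this loop, 16 values at a time. A standalone sketch, reusing the illustrative block struct from the earlier example (again not the real ggml types):

```cpp
#include <cstdint>

constexpr int QK = 32;

struct q4_0_block_sketch {
    float   d;
    uint8_t qs[QK / 2];
};

// Dequantize one block: undo the +8 bias and apply the scale. The low nibble
// of each byte is the even-indexed value and the high nibble the odd one -
// the same pairing the vzip1/vzip2 interleave above rebuilds in registers.
void dequantize_block_sketch(const q4_0_block_sketch * in, float * y) {
    for (int l = 0; l < QK; l += 2) {
        const uint8_t vi = in->qs[l / 2];
        y[l + 0] = ((vi & 0x0f) - 8) * in->d;
        y[l + 1] = ((vi >> 4)   - 8) * in->d;
    }
}
```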
@@ -842,6 +879,37 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
     const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
     const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
 
+#if defined(__AVX2__)
+    for (int i = 0; i < nb; i++) {
+        const __m256 d_v = _mm256_broadcast_ss((const float *) (pd + i*bs));
+        const __m256 d_m = _mm256_broadcast_ss((const float *) (pm + i*bs));
+
+        const uint8_t * restrict pp = pb + i*bs;
+
+        for (int l = 0; l < QK; l += 32) {
+            // Load 32x4-bit integers into 32x8-bit integers
+            __m256i vx8 = bytesFromNibbles(pp+l/2);
+
+            // Convert to 16-bit int
+            const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0));
+            const __m256i vx16_hi = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 1));
+
+            // Convert to 32-bit int -> float 32
+            const __m256 vf[4] = {
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 0))),
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_lo, 1))),
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 0))),
+                _mm256_cvtepi32_ps(_mm256_cvtepi16_epi32(_mm256_extracti128_si256(vx16_hi, 1)))
+            };
+
+            // Scale, add m and store
+            for (int j = 0; j < 4; j++) {
+                const __m256 result = _mm256_add_ps(_mm256_mul_ps(vf[j], d_v), d_m);
+                _mm256_storeu_ps(y + i * QK + l + j*8, result);
+            }
+        }
+    }
+#else
     for (int i = 0; i < nb; i++) {
         const float d = *(const float *) (pd + i*bs);
         const float m = *(const float *) (pm + i*bs);
@@ -864,6 +932,7 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
             assert(!isnan(y[i*QK + l + 1]));
         }
     }
+#endif
 }
 
 //
@@ -1500,8 +1569,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
 
     float sumf = 0.0;
 
-#ifdef __ARM_NEON
-#if QK == 32
+#if defined(__ARM_NEON)
     float sum0 = 0.0f;
     float sum1 = 0.0f;
@@ -1600,12 +1668,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
     }
 
     sumf = sum0 + sum1;
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__AVX512F__)
-#if QK == 32
     // Initialize accumulator with zeros
     __m512 acc0 = _mm512_setzero_ps();
     __m512 acc1 = _mm512_setzero_ps();
@@ -1634,11 +1697,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
 
     // Horizontal sum of all lanes of the accumulator
     sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 );
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__AVX2__)
-#if QK == 32
     const size_t countBlocks = nb;
 
     // Initialize accumulator with zeros
@@ -1689,11 +1748,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
     sumf = _mm_cvtss_f32( res );
-#else
-#error "not implemented for QK"
-#endif
 #elif defined(__wasm_simd128__)
-#if QK == 32
     // wasm simd
     float sum0 = 0.0f;
     float sum1 = 0.0f;
@@ -1776,9 +1831,6 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
     }
 
     sumf = sum0 + sum1;
-#else
-#error "not implemented for QK"
-#endif
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
@@ -1823,7 +1875,6 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
     float sumf = 0.0;
 
 #if defined(__AVX2__)
-#if QK == 32
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
     // Accumulator for constant offsets
@@ -1898,9 +1949,6 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
     res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
     sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
-#else
-#error "not implemented for QK"
-#endif
 #else
     // scalar
     for (int i = 0; i < nb; i++) {
@@ -2017,167 +2065,6 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {
 #endif
 }
 
-inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
-#if defined(GGML_SIMD)
-    const int np = (n & ~(GGML_F16_STEP - 1));
-
-    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
-
-    GGML_F16_VEC ax[GGML_F16_ARR];
-    GGML_F16_VEC ay[GGML_F16_ARR];
-
-    for (int i = 0; i < np; i += GGML_F16_STEP) {
-        for (int j = 0; j < GGML_F16_ARR; j++) {
-            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
-            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);
-
-            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
-        }
-    }
-
-    // leftovers
-    for (int i = np; i < n; ++i) {
-        GGML_ASSERT(false);
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
-#else
-    for (int i = 0; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
-#endif
-}
-
-inline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * restrict x, const float v) {
-    assert(n % QK == 0);
-
-    const int nb = n / QK;
-    const size_t bs = sizeof(float) + QK/2;
-
-    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
-    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));
-
-#if __ARM_NEON
-#if QK == 32
-    for (int i = 0; i < nb; ++i) {
-        const float d0 = v*(*(const float *) (pd + i*bs));
-
-        const uint8_t * restrict pp = pb + i*bs;
-
-        const uint8x8_t m4b = vdup_n_u8(0xf);
-        const int8x8_t  s8b = vdup_n_s8(0x8);
-
-        const float32x4_t vd = vdupq_n_f32(d0);
-
-        for (int j = 0; j < 2; j++) {
-            const uint8x8_t vx = vld1_u8(pp + j*8);
-
-            const int8x8_t vxl = vreinterpret_s8_u8(vand_u8(vx, m4b));
-            const int8x8_t vxh = vreinterpret_s8_u8(vshr_n_u8(vx, 4));
-
-            // sub 8
-            const int8x8_t vxls = vsub_s8(vxl, s8b);
-            const int8x8_t vxhs = vsub_s8(vxh, s8b);
-
-            //const int8x8_t vxlt = vzip_s8(vxls, vxhs)[0];
-            //const int8x8_t vxht = vzip_s8(vxls, vxhs)[1];
-            const int8x8_t vxlt = vzip1_s8(vxls, vxhs);
-            const int8x8_t vxht = vzip2_s8(vxls, vxhs);
-
-            const int8x16_t vxq = vcombine_s8(vxlt, vxht);
-
-            // convert to 2x int16x8_t
-            const int16x8_t vxq0 = vmovl_s8(vget_low_s8 (vxq));
-            const int16x8_t vxq1 = vmovl_s8(vget_high_s8(vxq));
-
-            // convert to 4x float32x4_t
-            const float32x4_t vx0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq0)));
-            const float32x4_t vx1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq0)));
-            const float32x4_t vx2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq1)));
-            const float32x4_t vx3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq1)));
-
-            const float32x4_t vy0 = vld1q_f32(y + i*32 + j*16 + 0);
-            const float32x4_t vy1 = vld1q_f32(y + i*32 + j*16 + 4);
-            const float32x4_t vy2 = vld1q_f32(y + i*32 + j*16 + 8);
-            const float32x4_t vy3 = vld1q_f32(y + i*32 + j*16 + 12);
-
-            const float32x4_t vr0 = vfmaq_f32(vy0, vx0, vd);
-            const float32x4_t vr1 = vfmaq_f32(vy1, vx1, vd);
-            const float32x4_t vr2 = vfmaq_f32(vy2, vx2, vd);
-            const float32x4_t vr3 = vfmaq_f32(vy3, vx3, vd);
-
-            vst1q_f32(y + i*32 + j*16 + 0,  vr0);
-            vst1q_f32(y + i*32 + j*16 + 4,  vr1);
-            vst1q_f32(y + i*32 + j*16 + 8,  vr2);
-            vst1q_f32(y + i*32 + j*16 + 12, vr3);
-        }
-    }
-#endif
-#else
-    // scalar
-    for (int i = 0; i < nb; i++) {
-        const float d = *(const float *) (pd + i*bs);
-
-        const uint8_t * restrict pp = pb + i*bs;
-
-        for (int l = 0; l < QK; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const int8_t vi0 = vi & 0xf;
-            const int8_t vi1 = vi >> 4;
-
-            const float v0 = (vi0 - 8)*d;
-            const float v1 = (vi1 - 8)*d;
-
-            y[i*QK + l + 0] += v0*v;
-            y[i*QK + l + 1] += v1*v;
-
-            assert(!isnan(y[i*QK + l + 0]));
-            assert(!isnan(y[i*QK + l + 1]));
-            assert(!isinf(y[i*QK + l + 0]));
-            assert(!isinf(y[i*QK + l + 1]));
-        }
-    }
-#endif
-}
-
-inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * restrict x, const float v) {
-    assert(n % QK == 0);
-
-    const int nb = n / QK;
-    const size_t bs = 2*sizeof(float) + QK/2;
-
-    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
-    const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
-    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
-
-    for (int i = 0; i < nb; i++) {
-        const float d = *(const float *) (pd + i*bs);
-        const float m = *(const float *) (pm + i*bs);
-
-        const uint8_t * restrict pp = pb + i*bs;
-
-        for (int l = 0; l < QK; l += 2) {
-            const uint8_t vi = pp[l/2];
-
-            const uint8_t vi0 = vi & 0xf;
-            const uint8_t vi1 = vi >> 4;
-
-            const float v0 = d*vi0 + m;
-            const float v1 = d*vi1 + m;
-
-            y[i*QK + l + 0] += v0*v;
-            y[i*QK + l + 1] += v1*v;
-
-            assert(!isnan(y[i*QK + l + 0]));
-            assert(!isnan(y[i*QK + l + 1]));
-            assert(!isinf(y[i*QK + l + 0]));
-            assert(!isinf(y[i*QK + l + 1]));
-            //printf("mad: v0 %f v1 %f, i = %d, l = %d, d = %f, vi = %d, vi0 = %d, vi1 = %d\n", v0, v1, i, l, d, vi, vi0, vi1);
-        }
-    }
-}
-
 //inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
 inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 #if defined(GGML_SIMD)
@@ -2617,6 +2504,10 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
         (t0->ne[3] == t1->ne[3]);
 }
 
+static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+    return tensor->nb[0] > tensor->nb[1];
+}
+
 static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
@@ -4010,6 +3901,7 @@ struct ggml_tensor * ggml_mul_mat(
         struct ggml_tensor * a,
         struct ggml_tensor * b) {
     GGML_ASSERT(ggml_can_mul_mat(a, b));
+    GGML_ASSERT(!ggml_is_transposed(a));
 
     bool is_node = false;
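Why `nb[0] > nb[1]` detects a transpose: ggml strides are byte offsets with nb[0] the innermost step, so a plain row-major f32 matrix has nb[0] == sizeof(float) and nb[1] == ne[0]*sizeof(float), and a transpose view simply swaps the two strides without moving data. A tiny stand-in illustration (not the real ggml_tensor):

```cpp
#include <cassert>
#include <cstddef>

// Stand-in for the two stride fields ggml_is_transposed compares.
struct tensor_view_sketch {
    size_t nb0, nb1; // byte step between adjacent columns / rows
};

int main() {
    // 4x3 row-major f32 tensor: columns are contiguous in memory...
    tensor_view_sketch t  = { sizeof(float), 3 * sizeof(float) };
    // ...and its transpose view just swaps the strides.
    tensor_view_sketch tt = { t.nb1, t.nb0 };

    assert(!(t.nb0  > t.nb1));  // not transposed
    assert( (tt.nb0 > tt.nb1)); // transposed
    return 0;
}
```

This is why the mul_mat paths below can drop the transposed-src0 branches: the new assertion rejects transposed inputs up front instead of handling them with the slower column-wise code.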
@@ -5881,8 +5773,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
         const struct ggml_tensor * src0,
         const struct ggml_tensor * src1,
               struct ggml_tensor * dst) {
-    const int ne00 = src0->ne[0];
-    const int ne01 = src0->ne[1];
+    //const int ne00 = src0->ne[0];
+    //const int ne01 = src0->ne[1];
 
     const int ne10 = src1->ne[0];
@@ -5916,16 +5808,16 @@ static void ggml_compute_forward_mul_mat_f32(
     const int ne10 = src1->ne[0];
     const int ne11 = src1->ne[1];
-    const int ne12 = src1->ne[2];
-    const int ne13 = src1->ne[3];
-
-    const int ne0  = dst->ne[0];
-    const int ne1  = dst->ne[1];
-    const int ne2  = dst->ne[2];
-    const int ne3  = dst->ne[3];
-    const int ne   = ne0*ne1*ne2*ne3;
-
-    const int nb00 = src0->nb[0];
+    //const int ne12 = src1->ne[2];
+    //const int ne13 = src1->ne[3];
+
+    //const int ne0  = dst->ne[0];
+    //const int ne1  = dst->ne[1];
+    //const int ne2  = dst->ne[2];
+    //const int ne3  = dst->ne[3];
+    //const int ne   = ne0*ne1*ne2*ne3;
+
+    //const int nb00 = src0->nb[0];
     const int nb01 = src0->nb[1];
     const int nb02 = src0->nb[2];
     const int nb03 = src0->nb[3];
@@ -5949,7 +5841,7 @@ static void ggml_compute_forward_mul_mat_f32(
     assert(ne3  == ne13);
 
     // TODO: we don't support permuted src0
-    assert(nb00 == sizeof(float) || nb01 == sizeof(float));
+    assert(nb00 == sizeof(float));
 
     // dst cannot be transposed or permuted
     assert(nb0 == sizeof(float));
@@ -5964,9 +5856,6 @@ static void ggml_compute_forward_mul_mat_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
-    //
-    // nb00 <  nb01 - src0 is transposed
-    //   compute by src0 columns
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
@@ -6007,42 +5896,13 @@ static void ggml_compute_forward_mul_mat_f32(
 #endif
 
     if (params->type == GGML_TASK_INIT) {
-        if (nb01 >= nb00) {
-            return;
-        }
-
-        // TODO: fix this memset (wsize is overestimated)
-        memset(params->wdata, 0, params->wsize);
-
         return;
     }
 
     if (params->type == GGML_TASK_FINALIZE) {
-        if (nb01 >= nb00) {
-            return;
-        }
-
-        // TODO: fix this memset (wsize is overestimated)
-        //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth);
-
-        float * const wdata = params->wdata;
-
-        // cols per thread
-        const int dc = (ne + nth - 1)/nth;
-
-        // col range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, ne);
-
-        ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);
-
-        for (int k = 1; k < nth; k++) {
-            ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0);
-        }
-
         return;
     }
 
-    if (nb01 >= nb00) {
     // TODO: do not support transposed src1
     assert(nb10 == sizeof(float));
@@ -6082,53 +5942,6 @@ static void ggml_compute_forward_mul_mat_f32(
                 (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));
         }
     }
-    } else {
-        // parallelize by src1 columns using ggml_vec_mad_f32
-        // each thread has its own work data
-        // during FINALIZE we accumulate all work data into dst
-
-        // total columns in src1
-        const int nc = ne10;
-
-        // columns per thread
-        const int dc = (nc + nth - 1)/nth;
-
-        // column range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, nc);
-
-        // work data for thread
-        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;
-        float * const wdata = params->wdata;
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
-                    for (int ic = ic0; ic < ic1; ++ic) {
-                        // src1 indices
-                        const int i10 = ic;
-
-                        // src0 indices
-                        const int i03 = i13;
-                        const int i02 = i12;
-                        const int i00 = ic;
-
-                        // dst indices
-                        const int i1 = i11;
-                        const int i2 = i12;
-                        const int i3 = i13;
-
-                        assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);
-
-                        ggml_vec_mad_f32(ne01,
-                                (float *) (wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0),
-                                (float *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)),
-                                *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)));
-                    }
-                }
-            }
-        }
-    }
 
     //int64_t t1 = ggml_perf_time_us();
     //static int64_t acc = 0;
@@ -6166,7 +5979,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
    const int ne1  = dst->ne[1];
    const int ne2  = dst->ne[2];
    const int ne3  = dst->ne[3];
-   const int ne   = ne0*ne1*ne2*ne3;
+   //const int ne   = ne0*ne1*ne2*ne3;

    const int nb00 = src0->nb[0];
    const int nb01 = src0->nb[1];
@@ -6192,7 +6005,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
    GGML_ASSERT(ne3  == ne13);

    // TODO: we don't support permuted src0
-   GGML_ASSERT(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t));
+   GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));

    // dst cannot be transposed or permuted
    GGML_ASSERT(nb0 == sizeof(float));
@@ -6207,9 +6020,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows
-   //
-   // nb00 <  nb01 - src0 is transposed
-   //   compute by src0 columns

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
@@ -6261,7 +6071,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
#endif

    if (params->type == GGML_TASK_INIT) {
-       if (nb01 >= nb00) {
        ggml_fp16_t * const wdata = params->wdata;

        size_t id = 0;
@@ -6280,42 +6089,10 @@ static void ggml_compute_forward_mul_mat_f16_f32(
        return;
    }
-
-       // TODO: fix this memset (wsize is overestimated)
-       memset(params->wdata, 0, params->wsize);
-       return;
-   }

    if (params->type == GGML_TASK_FINALIZE) {
-       if (nb01 >= nb00) {
        return;
    }
-
-       // TODO: fix this memset (wsize is overestimated)
-       //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth);
-
-       ggml_fp16_t * const wdata = params->wdata;
-
-       // cols per thread
-       const int dc = (ne + nth - 1)/nth;
-
-       // col range for this thread
-       const int ic0 = dc*ith;
-       const int ic1 = MIN(ic0 + dc, ne);
-
-       for (int i = ic0; i < ic1; ++i) {
-           ((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);
-       }
-
-       for (int k = 1; k < nth; k++) {
-           for (int i = ic0; i < ic1; ++i) {
-               ((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);
-           }
-       }
-
-       return;
-   }
-
-   if (nb01 >= nb00) {
    // fp16 -> half the size, so divide by 2
    // TODO: do not support transposed src1
    assert(nb10/2 == sizeof(ggml_fp16_t));
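
The deleted FINALIZE block above was the reduction step of the old column path: each thread accumulated into its own fp16 scratch slab, and the slabs were converted and summed into dst afterwards. In miniature, and purely as an illustration of what was removed (hypothetical sizes, fp32 used for brevity):

    // finalize_reduce_sketch.cpp - the per-thread-slab reduction in miniature
    #include <cstdio>
    #include <vector>

    int main() {
        const int ne = 4, nth = 3;              // hypothetical dst size, thread count
        std::vector<float> wdata(ne*nth, 1.0f); // one partial-result slab per thread
        std::vector<float> dst(ne, 0.0f);
        for (int k = 0; k < nth; ++k) {         // sum every thread's slab into dst
            for (int i = 0; i < ne; ++i) {
                dst[i] += wdata[k*ne + i];
            }
        }
        printf("dst[0] = %.1f\n", dst[0]);      // 3.0: one contribution per thread
        return 0;
    }
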
@@ -6356,55 +6133,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);
            }
        }
-    } else {
-        // parallelize by src1 columns using ggml_vec_mad_f16
-        // each thread has its own work data
-        // during FINALIZE we accumulate all work data into dst
-
-        // total columns in src1
-        const int nc = ne10;
-
-        // columns per thread
-        const int dc = (nc + nth - 1)/nth;
-
-        // column range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, nc);
-
-        // work data for thread
-        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;
-        ggml_fp16_t * const wdata = params->wdata;
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
-                    // dst indices
-                    const int i1 = i11;
-                    const int i2 = i12;
-                    const int i3 = i13;
-
-                    ggml_fp16_t * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;
-
-                    for (int ic = ic0; ic < ic1; ++ic) {
-                        // src1 indices
-                        const int i10 = ic;
-
-                        // src0 indices
-                        const int i03 = i13;
-                        const int i02 = i12;
-                        const int i00 = ic;
-
-                        assert(sizeof(ggml_fp16_t)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);
-
-                        ggml_fp16_t * src0_col =  (ggml_fp16_t *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03));
-                        float         src1_val = *(float *)       ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-
-                        ggml_vec_mad_f16(ne01, dst_row, src0_col, src1_val);
-                    }
-                }
-            }
-        }
-    }

    //int64_t t1 = ggml_time_us();
    //static int64_t acc = 0;
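
The q4_0 and q4_1 paths below receive the same simplification. Their surviving INIT pass quantizes each src1 row into the work buffer and advances by (ne10*GGML_TYPE_SIZE[...])/GGML_BLCK_SIZE[...] bytes per row. Assuming q4_0's layout of this era - blocks of 32 weights at 4 bits each, plus one fp32 scale per block - that stride works out as follows:

    // q4_row_bytes_sketch.cpp - bytes per quantized row under the assumed q4_0 layout
    #include <cstdio>

    int main() {
        const int    qk          = 32;                   // weights per block (assumed GGML_BLCK_SIZE)
        const size_t block_bytes = sizeof(float) + qk/2; // fp32 scale + 32 x 4-bit quants
        const int    ne10        = 4096;                 // hypothetical row length
        printf("q4_0 row: %zu bytes, fp32 row: %zu bytes\n",
               (ne10/qk)*block_bytes, ne10*sizeof(float)); // 2560 vs 16384
        return 0;
    }
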
@@ -6441,7 +6169,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
    const int ne1  = dst->ne[1];
    const int ne2  = dst->ne[2];
    const int ne3  = dst->ne[3];
-   const int ne   = ne0*ne1*ne2*ne3;
+   //const int ne   = ne0*ne1*ne2*ne3;

    const int nb00 = src0->nb[0];
    const int nb01 = src0->nb[1];
@@ -6467,7 +6195,7 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
    GGML_ASSERT(ne3 == ne13);

    // TODO: we don't support permuted src0
-   GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]);
+   GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]);

    // dst cannot be transposed or permuted
    GGML_ASSERT(nb0 == sizeof(float));
@@ -6482,9 +6210,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows
-   //
-   // nb00 <  nb01 - src0 is transposed
-   //   compute by src0 columns

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
@@ -6509,9 +6234,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
        {
            size_t id = 0;
            for (int i01 = 0; i01 < ne01; ++i01) {
-               //for (int i00 = 0; i00 < ne00; ++i00) {
-               //    wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
-               //}
                dequantize_row_q4_0((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                id += ne00;
            }
@@ -6538,16 +6260,11 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
#endif

    if (params->type == GGML_TASK_INIT) {
-       //printf("HHHHHHHHH ith = %d, nth = %d\n", ith, nth);
-       if (nb01 >= nb00) {
        char * wdata = params->wdata;

        for (int i13 = 0; i13 < ne13; ++i13) {
            for (int i12 = 0; i12 < ne12; ++i12) {
                for (int i11 = 0; i11 < ne11; ++i11) {
-                   //for (int i10 = 0; i10 < ne10; ++i10) {
-                   //    wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));
-                   //}
                    quantize_row_q4_0((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);
                    wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0];
                }
@@ -6557,35 +6274,10 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
        return;
    }
-
-       // TODO: fix this memset (wsize is overestimated)
-       memset(params->wdata, 0, params->wsize);
-       return;
-   }

    if (params->type == GGML_TASK_FINALIZE) {
-       if (nb01 >= nb00) {
        return;
    }
-
-       float * const wdata = params->wdata;
-
-       // cols per thread
-       const int dc = (ne + nth - 1)/nth;
-
-       // col range for this thread
-       const int ic0 = dc*ith;
-       const int ic1 = MIN(ic0 + dc, ne);
-
-       ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);
-       for (int k = 1; k < nth; k++) {
-           ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0);
-       }
-
-       return;
-   }
-
-   if (nb01 >= nb00) {
    // TODO: do not support transposed src1

    // parallelize by src0 rows using ggml_vec_dot_q4_0
@@ -6626,56 +6318,6 @@ static void ggml_compute_forward_mul_mat_q4_0_f32(
                ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0])));
            }
        }
-    } else {
-        //printf("AAAAA ith = %d, nth = %d\n", ith, nth);
-
-        // parallelize by src1 columns using ggml_vec_mad_q4_0
-        // each thread has its own work data
-        // during FINALIZE we accumulate all work data into dst
-
-        // total columns in src1
-        const int nc = ne10;
-
-        // columns per thread
-        const int dc = (nc + nth - 1)/nth;
-
-        // column range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, nc);
-
-        // work data for thread
-        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;
-        float * const wdata = params->wdata;
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
-                    // dst indices
-                    const int i1 = i11;
-                    const int i2 = i12;
-                    const int i3 = i13;
-
-                    float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;
-
-                    for (int ic = ic0; ic < ic1; ++ic) {
-                        // src1 indices
-                        const int i10 = ic;
-
-                        // src0 indices
-                        const int i03 = i13;
-                        const int i02 = i12;
-                        const int i00 = ic;
-
-                        assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);
-
-                        void * src0_col =   (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03));
-                        float  src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-
-                        ggml_vec_mad_q4_0(ne01, dst_row, src0_col, src1_val);
-                    }
-                }
-            }
-        }
-    }

    //int64_t t1 = ggml_time_us();
    //static int64_t acc = 0;
@@ -6712,7 +6354,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
    const int ne1  = dst->ne[1];
    const int ne2  = dst->ne[2];
    const int ne3  = dst->ne[3];
-   const int ne   = ne0*ne1*ne2*ne3;
+   //const int ne   = ne0*ne1*ne2*ne3;

    const int nb00 = src0->nb[0];
    const int nb01 = src0->nb[1];
@@ -6738,7 +6380,7 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
    GGML_ASSERT(ne3 == ne13);

    // TODO: we don't support permuted src0
-   GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]);
+   GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]);

    // dst cannot be transposed or permuted
    GGML_ASSERT(nb0 == sizeof(float));
@@ -6753,9 +6395,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
    // nb01 >= nb00 - src0 is not transposed
    //   compute by src0 rows
-   //
-   // nb00 <  nb01 - src0 is transposed
-   //   compute by src0 columns

#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
@@ -6780,9 +6419,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
        {
            size_t id = 0;
            for (int i01 = 0; i01 < ne01; ++i01) {
-               //for (int i00 = 0; i00 < ne00; ++i00) {
-               //    wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
-               //}
                dequantize_row_q4_1((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                id += ne00;
            }
@@ -6809,8 +6445,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
#endif

    if (params->type == GGML_TASK_INIT) {
-       //printf("HHHHHHHHH ith = %d, nth = %d\n", ith, nth);
-       if (nb01 >= nb00) {
        char * wdata = params->wdata;

        for (int i13 = 0; i13 < ne13; ++i13) {
@@ -6828,35 +6462,10 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
        return;
    }
-
-       // TODO: fix this memset (wsize is overestimated)
-       memset(params->wdata, 0, params->wsize);
-       return;
-   }

    if (params->type == GGML_TASK_FINALIZE) {
-       if (nb01 >= nb00) {
        return;
    }
-
-       float * const wdata = params->wdata;
-
-       // cols per thread
-       const int dc = (ne + nth - 1)/nth;
-
-       // col range for this thread
-       const int ic0 = dc*ith;
-       const int ic1 = MIN(ic0 + dc, ne);
-
-       ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);
-       for (int k = 1; k < nth; k++) {
-           ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0);
-       }
-
-       return;
-   }
-
-   if (nb01 >= nb00) {
    // TODO: do not support transposed src1

    // parallelize by src0 rows using ggml_vec_dot_q4_1
@@ -6897,56 +6506,6 @@ static void ggml_compute_forward_mul_mat_q4_1_f32(
                ggml_vec_dot_q4_1(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1])));
            }
        }
-    } else {
-        //printf("AAAAA ith = %d, nth = %d\n", ith, nth);
-
-        // parallelize by src1 columns using ggml_vec_mad_q4_1
-        // each thread has its own work data
-        // during FINALIZE we accumulate all work data into dst
-
-        // total columns in src1
-        const int nc = ne10;
-
-        // columns per thread
-        const int dc = (nc + nth - 1)/nth;
-
-        // column range for this thread
-        const int ic0 = dc*ith;
-        const int ic1 = MIN(ic0 + dc, nc);
-
-        // work data for thread
-        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;
-        float * const wdata = params->wdata;
-
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                for (int i11 = 0; i11 < ne11; ++i11) {
-                    // dst indices
-                    const int i1 = i11;
-                    const int i2 = i12;
-                    const int i3 = i13;
-
-                    float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;
-
-                    for (int ic = ic0; ic < ic1; ++ic) {
-                        // src1 indices
-                        const int i10 = ic;
-
-                        // src0 indices
-                        const int i03 = i13;
-                        const int i02 = i12;
-                        const int i00 = ic;
-
-                        assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);
-
-                        void * src0_col =   (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03));
-                        float  src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
-
-                        ggml_vec_mad_q4_1(ne01, dst_row, src0_col, src1_val);
-                    }
-                }
-            }
-        }
-    }

    //int64_t t1 = ggml_time_us();
    //static int64_t acc = 0;
@@ -9588,11 +9147,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                        size_t cur = 0;

-                       // TODO: better way to determine if the matrix is transposed
-                       if (node->src0->nb[1] < node->src0->nb[0]) {
-                           cur = ggml_nbytes(node)*node->n_tasks; // TODO: this can become (n_tasks-1)
-                                                                  // TODO: overestimated by factor of x2 for FP16
-                       } else {
                        if (node->src0->type == GGML_TYPE_F16 &&
                            node->src1->type == GGML_TYPE_F32) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@@ -9639,7 +9193,6 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                        } else {
                            GGML_ASSERT(false);
                        }
-                       }

                        work_size = MAX(work_size, cur);
                    } break;
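
With the transposed case gone, the mul_mat work-size estimate no longer reserves a dst-sized slab per task (the deleted cur = ggml_nbytes(node)*node->n_tasks). What remains is a single shared conversion buffer; for the f16 x f32 branch, for instance, one fp16 copy of src1 appears to suffice, since threads read it concurrently and write disjoint dst rows. A rough sketch of that sizing, under those assumptions:

    // worksize_sketch.cpp - scratch assumed by the surviving f16 x f32 branch
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t ne10 = 4096, ne11 = 8;      // hypothetical src1 dimensions
        const size_t  fp16_size = 2;              // sizeof(ggml_fp16_t)
        const size_t  cur = fp16_size*ne10*ne11;  // one shared fp16 copy of src1
        printf("work buffer: %zu bytes (independent of n_tasks)\n", cur);
        return 0;
    }
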

View file

@@ -1261,10 +1261,10 @@ static llama_vocab::id llama_sample_top_p_top_k(
        double repeat_penalty) {
    auto & rng = lctx.rng;

-   const auto & vocab = lctx.vocab;
-   const auto & logits = lctx.logits;
-
-   int n_logits = vocab.id_to_token.size();
+   const int n_logits = lctx.model.hparams.n_vocab;
+
+   const auto & logits = lctx.logits;
+   const auto * plogits = logits.data() + logits.size() - n_logits;

    std::vector<std::pair<double, llama_vocab::id>> logits_id;
    logits_id.reserve(n_logits);
@@ -1276,13 +1276,13 @@ static llama_vocab::id llama_sample_top_p_top_k(
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-               if (logits[i] < 0.0) {
-                   logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
+               if (plogits[i] < 0.0) {
+                   logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
-                   logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
+                   logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
-               logits_id.push_back(std::make_pair(logits[i]*scale, i));
+               logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }
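
The switch from logits[i] to plogits[i] matters once lctx.logits can hold more than one row: with logits_all set, the buffer stores one n_vocab-sized row per evaluated token, and sampling must read only the final row. A small self-contained illustration of the same pointer arithmetic:

    // plogits_sketch.cpp - indexing the last logits row out of a flat buffer
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_vocab = 5, n_tokens = 3;       // hypothetical sizes
        std::vector<float> logits(n_tokens*n_vocab);
        for (size_t i = 0; i < logits.size(); ++i) logits[i] = (float) i;
        // same expression as in llama_sample_top_p_top_k
        const float * plogits = logits.data() + logits.size() - n_vocab;
        printf("plogits[0] = %.1f\n", plogits[0]); // 10.0: start of the last row
        return 0;
    }
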
@@ -1677,6 +1677,8 @@ struct llama_context * llama_init_from_file(
        }

        const auto & hparams = ctx->model.hparams;
+
+       // resized during inference
        if (params.logits_all) {
            ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
        } else {
@@ -1684,7 +1686,7 @@ struct llama_context * llama_init_from_file(
        }

        if (params.embedding){
-           ctx->embedding.reserve(hparams.n_embd);
+           ctx->embedding.resize(hparams.n_embd);
        }

        ctx->buf_compute.resize(MEM_REQ_EVAL.at(ctx->model.type));
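
The reserve -> resize change for ctx->embedding is a bug fix rather than a style tweak: std::vector::reserve only allocates capacity and leaves size() at zero, so any later read of n_embd elements through data() was out of bounds, whereas resize value-initializes the elements and makes them safe to read:

    // reserve_vs_resize_sketch.cpp
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<float> a, b;
        a.reserve(8); // capacity 8, size() == 0 - reading a.data()[0] is undefined
        b.resize(8);  // size() == 8, elements zero-initialized - safe to read
        printf("a.size() = %zu, b.size() = %zu\n", a.size(), b.size());
        return 0;
    }

The logits buffer keeps reserve because, as the new comment notes, it is resized during inference before being read.
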
@@ -1761,6 +1763,10 @@ int llama_n_ctx(struct llama_context * ctx) {
    return ctx->model.hparams.n_ctx;
}

+int llama_n_embd(struct llama_context * ctx) {
+    return ctx->model.hparams.n_embd;
+}
+
float * llama_get_logits(struct llama_context * ctx) {
    return ctx->logits.data();
}

View file

@@ -109,6 +109,7 @@ extern "C" {
    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
    LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
+   LLAMA_API int llama_n_embd (struct llama_context * ctx);

    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
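
llama_n_embd rounds out the dimension accessors, letting callers size buffers for embedding output without reaching into internal headers. A minimal usage sketch; it assumes a context obtained elsewhere via llama_init_from_file:

    // n_embd_usage_sketch.cpp
    #include "llama.h"
    #include <cstdio>

    void print_model_dims(struct llama_context * ctx) {
        printf("n_vocab = %d\n", llama_n_vocab(ctx));
        printf("n_ctx   = %d\n", llama_n_ctx(ctx));
        printf("n_embd  = %d\n", llama_n_embd(ctx)); // the new accessor
    }
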

View file

@@ -1,7 +1,7 @@
function(llama_add_test source)
    get_filename_component(TEST_TARGET ${source} NAME_WE)
    add_executable(${TEST_TARGET} ${source})
-   target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
+   target_link_libraries(${TEST_TARGET} PRIVATE llama)
    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
endfunction()

View file

@@ -1,9 +1,9 @@
-#include "utils.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <map>
+#include <vector>

static const std::map<std::string, std::vector<llama_token>> k_tests = {
    { "Hello World",            { 1,  10994,   2787, }, },
@@ -48,7 +48,9 @@ int main(int argc, char **argv) {
    }

    for (const auto & test_kv : k_tests) {
-       const auto res = ::llama_tokenize(ctx, test_kv.first, true);
+       std::vector<llama_token> res(test_kv.first.size());
+       const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
+       res.resize(n);

        bool correct = res.size() == test_kv.second.size();
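
The updated test follows the calling convention of the C-style llama_tokenize: the caller supplies a buffer, the function returns the number of tokens written, and the vector is shrunk to fit. A hedged sketch of a reusable wrapper built on that pattern (the one-token-per-byte bound and the negative-return handling are assumptions about this era's API):

    // tokenize_wrapper_sketch.cpp
    #include "llama.h"
    #include <string>
    #include <vector>

    static std::vector<llama_token> tokenize(struct llama_context * ctx,
                                             const std::string & text, bool add_bos) {
        // one token per input byte, plus optional BOS, as an upper bound
        std::vector<llama_token> res(text.size() + (add_bos ? 1 : 0));
        const int n = llama_tokenize(ctx, text.c_str(), res.data(), (int) res.size(), add_bos);
        res.resize(n > 0 ? n : 0); // a negative return would signal a too-small buffer
        return res;
    }
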