Merge branch 'master' into concedo_experimental

# Conflicts:
#	CMakeLists.txt
#	Makefile
#	build.zig
#	flake.lock
#	flake.nix
#	ggml.c
This commit is contained in:
Concedo 2023-10-31 20:44:04 +08:00
commit cc5b282350
24 changed files with 2609 additions and 2512 deletions

View file

@ -1,7 +1,7 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug"]
labels: ["bug-unconfirmed"]
assignees: ''
---

View file

@ -53,7 +53,6 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
#
@ -67,7 +66,6 @@ set(CMAKE_C_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
add_compile_definitions(GGML_USE_K_QUANTS)
add_compile_definitions(LOG_DISABLE_LOGS)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
@ -358,8 +356,8 @@ add_library(ggml OBJECT
ggml-alloc.h
ggml-backend.c
ggml-backend.h
k_quants.h
k_quants.c
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA})
target_include_directories(ggml PUBLIC . ./otherarch ./otherarch/tools)
target_compile_features(ggml PUBLIC c_std_11) # don't bump

View file

@ -43,11 +43,6 @@ CFLAGS = -I. -I./include -I./include/CL -I./otherarch -I./otherarch
CXXFLAGS = -I. -I./common -I./include -I./include/CL -I./otherarch -I./otherarch/tools -Ofast -DNDEBUG -std=c++11 -fPIC -DLOG_DISABLE_LOGS -D_GNU_SOURCE
LDFLAGS =
ifndef LLAMA_NO_K_QUANTS
CFLAGS += -DGGML_USE_K_QUANTS
CXXFLAGS += -DGGML_USE_K_QUANTS
endif
# these are used on windows, to build some libraries with extra old device compatibility
SIMPLECFLAGS =
FULLCFLAGS =
@ -340,34 +335,27 @@ $(info )
# Build library
#
ggml.o: ggml.c ggml.h ggml-cuda.h k_quants.h
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
ggml_openblas.o: ggml.c ggml.h ggml-cuda.h k_quants.h
ggml_openblas.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
ggml_failsafe.o: ggml.c ggml.h ggml-cuda.h k_quants.h
ggml_failsafe.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
ggml_noavx2.o: ggml.c ggml.h ggml-cuda.h k_quants.h
ggml_noavx2.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
ggml_clblast.o: ggml.c ggml.h ggml-cuda.h k_quants.h
ggml_clblast.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
ggml_cublas.o: ggml.c ggml.h ggml-cuda.h k_quants.h
ggml_cublas.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) $(FULLCFLAGS) $(CUBLAS_FLAGS) $(HIPFLAGS) -c $< -o $@
#quants K
KQ1 =
KQ2 =
KQ3 =
ifndef LLAMA_NO_K_QUANTS
KQ1 = k_quants.o
KQ2 = k_quants_noavx2.o
KQ3 = k_quants_failsafe.o
k_quants.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
#quants
ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h
$(CC) $(CFLAGS) $(FULLCFLAGS) -c $< -o $@
k_quants_noavx2.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
ggml-quants_noavx2.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h
$(CC) $(CFLAGS) $(SIMPLECFLAGS) -c $< -o $@
k_quants_failsafe.o: k_quants.c k_quants.h ggml.h ggml-cuda.h
ggml-quants_failsafe.o: ggml-quants.c ggml.h ggml-quants.h ggml-cuda.h
$(CC) $(CFLAGS) $(NONECFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS
#there's no intrinsics or special gpu ops used here, so we can have a universal object
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
@ -429,7 +417,7 @@ gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
clean:
rm -vf *.o main quantize_llama quantize_gpt2 quantize_gptj quantize_neox quantize_mpt quantize-stats perplexity embedding benchmark-matmult save-load-state gguf gguf.exe main.exe quantize_llama.exe quantize_gptj.exe quantize_gpt2.exe quantize_neox.exe quantize_mpt.exe koboldcpp_default.dll koboldcpp_openblas.dll koboldcpp_failsafe.dll koboldcpp_noavx2.dll koboldcpp_clblast.dll koboldcpp_cublas.dll koboldcpp_hipblas.dll koboldcpp_default.so koboldcpp_openblas.so koboldcpp_failsafe.so koboldcpp_noavx2.so koboldcpp_clblast.so koboldcpp_cublas.so koboldcpp_hipblas.so
main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o $(KQ1) ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
main: examples/main/main.cpp common/sampling.cpp build-info.h ggml.o ggml-quants.o ggml-alloc.o ggml-backend.o llama.o common.o console.o grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@ -440,11 +428,11 @@ gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
#generated libraries
koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
koboldcpp_default: ggml.o ggml_v2.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
$(DEFAULT_BUILD)
ifdef OPENBLAS_BUILD
koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
koboldcpp_openblas: ggml_openblas.o ggml_v2_openblas.o ggml_v1.o expose.o common.o gpttype_adapter.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
$(OPENBLAS_BUILD)
else
koboldcpp_openblas:
@ -452,7 +440,7 @@ koboldcpp_openblas:
endif
ifdef FAILSAFE_BUILD
koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ3) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
koboldcpp_failsafe: ggml_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_failsafe.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
$(FAILSAFE_BUILD)
else
koboldcpp_failsafe:
@ -460,7 +448,7 @@ koboldcpp_failsafe:
endif
ifdef NOAVX2_BUILD
koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o $(KQ2) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
koboldcpp_noavx2: ggml_noavx2.o ggml_v2_noavx2.o ggml_v1_failsafe.o expose.o common.o gpttype_adapter_failsafe.o ggml-quants_noavx2.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
$(NOAVX2_BUILD)
else
koboldcpp_noavx2:
@ -468,7 +456,7 @@ koboldcpp_noavx2:
endif
ifdef CLBLAST_BUILD
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
koboldcpp_clblast: ggml_clblast.o ggml_v2_clblast.o ggml_v1.o expose.o common.o gpttype_adapter_clblast.o ggml-opencl.o ggml_v2-opencl.o ggml_v2-opencl-legacy.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(OBJS)
$(CLBLAST_BUILD)
else
koboldcpp_clblast:
@ -476,7 +464,7 @@ koboldcpp_clblast:
endif
ifdef CUBLAS_BUILD
koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
koboldcpp_cublas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(CUBLAS_OBJS) $(OBJS)
$(CUBLAS_BUILD)
else
koboldcpp_cublas:
@ -484,7 +472,7 @@ koboldcpp_cublas:
endif
ifdef HIPBLAS_BUILD
koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o $(KQ1) ggml-alloc.o ggml-backend.o grammar-parser.o $(HIP_OBJS) $(OBJS)
koboldcpp_hipblas: ggml_cublas.o ggml_v2_cublas.o ggml_v1.o expose.o common.o gpttype_adapter_cublas.o ggml-quants.o ggml-alloc.o ggml-backend.o grammar-parser.o $(HIP_OBJS) $(OBJS)
$(HIPBLAS_BUILD)
else
koboldcpp_hipblas:
@ -492,15 +480,15 @@ koboldcpp_hipblas:
endif
# tools
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o
quantize_llama: examples/quantize/quantize.cpp ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_gptj: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
quantize_gptj: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o otherarch/tools/gptj_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_gpt2: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
quantize_gpt2: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o otherarch/tools/gpt2_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_neox: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
quantize_neox: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o otherarch/tools/neox_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize_mpt: ggml.o llama.o $(KQ1) ggml-alloc.o ggml-backend.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
quantize_mpt: ggml.o llama.o ggml-quants.o ggml-alloc.o ggml-backend.o otherarch/tools/mpt_quantize.cpp otherarch/tools/common-ggml.cpp
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

View file

@ -42,13 +42,12 @@ let package = Package(
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"k_quants.c",
"ggml-quants.c",
] + additionalSources,
resources: resources,
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_K_QUANTS"),
.define("GGML_USE_ACCELERATE")
// NOTE: NEW_LAPACK will required iOS version 16.4+
// We should consider add this in the future when we drop support for iOS 14

View file

@ -224,6 +224,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
sparams.temp = std::stof(argv[i]);
sparams.temp = std::max(sparams.temp, 0.0f);
} else if (arg == "--tfs") {
if (++i >= argc) {
invalid_param = true;
@ -743,7 +744,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#endif // GGML_USE_CUBLAS
#endif
printf(" --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
@ -888,7 +889,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
llama_kv_cache_tokens_rm(lctx, -1, -1);
llama_kv_cache_clear(lctx);
llama_reset_timings(lctx);
}

View file

@ -167,8 +167,12 @@ llama_token llama_sampling_sample(
llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
}
if (temp <= 0) {
// greedy sampling
if (temp < 0.0) {
// greedy sampling, with probs
llama_sample_softmax(ctx_main, &cur_p);
id = cur_p.data[0].id;
} else if (temp == 0.0) {
// greedy sampling, no probs
id = llama_sample_token_greedy(ctx_main, &cur_p);
} else {
if (mirostat == 1) {

View file

@ -366,15 +366,18 @@ class SentencePieceVocab:
added_tokens = {}
vocab_size: int = self.sentencepiece_tokenizer.vocab_size()
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
raise Exception(f"Expected added token IDs to be sequential and start at {vocab_size}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base: int = vocab_size
self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
self.fname_added_tokens = fname_added_tokens

View file

@ -185,7 +185,7 @@ int main(int argc, char ** argv) {
const auto t_pp_start = ggml_time_us();
llama_kv_cache_tokens_rm(ctx, -1, -1);
llama_kv_cache_clear(ctx);
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
LOG_TEE("%s: llama_decode() failed\n", __func__);

View file

@ -1037,7 +1037,7 @@ int main(int argc, char ** argv) {
test t(inst, lmodel, ctx);
llama_kv_cache_tokens_rm(ctx, -1, -1);
llama_kv_cache_clear(ctx);
// warmup run
if (t.n_prompt > 0) {
@ -1048,7 +1048,7 @@ int main(int argc, char ** argv) {
}
for (int i = 0; i < params.reps; i++) {
llama_kv_cache_tokens_rm(ctx, -1, -1);
llama_kv_cache_clear(ctx);
uint64_t t_start = get_time_ns();
if (t.n_prompt > 0) {

View file

@ -298,7 +298,7 @@ int main(int argc, char ** argv) {
}
// remove any "future" tokens that we might have inherited from the previous session
llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1);
llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
}
LOGLN(

View file

@ -210,7 +210,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1);
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
@ -339,7 +339,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
const auto t_start = std::chrono::high_resolution_clock::now();
// clear the KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1);
llama_kv_cache_clear(ctx);
for (int j = 0; j < num_batches; ++j) {
const int batch_start = start + j * n_batch;
@ -573,7 +573,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
}
// clear the KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1);
llama_kv_cache_clear(ctx);
auto logits = hellaswag_evaluate_tokens(ctx, query_embd, 0, params.n_batch, n_vocab);
if (logits.empty()) {

View file

@ -18,7 +18,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
#ifdef GGML_USE_K_QUANTS
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
@ -31,7 +30,6 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, -0.0008 ppl @ LLaMA-v1-7B", },
#endif
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
@ -70,13 +68,14 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
}
// usage:
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
// ./quantize [--allow-requantize] [--leave-output-tensor] [--pure] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
[[noreturn]]
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
if (it.name != "COPY") {
@ -103,6 +102,8 @@ int main(int argc, char ** argv) {
params.quantize_output_tensor = false;
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
params.pure = true;
} else {
usage(argv[0]);
}

View file

@ -857,7 +857,7 @@ struct llama_server_context
void kv_cache_clear() {
// clear the entire KV cache
llama_kv_cache_tokens_rm(ctx, -1, -1);
llama_kv_cache_clear(ctx);
clean_kv_cache = false;
}

View file

@ -148,7 +148,7 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft);
params.sparams.grammar.clear(); // the draft samplers will copy the target sampler's grammar
params.sparams.temp = std::max(0.01f, params.sparams.temp);
params.sparams.temp = -1.0f; // force greedy sampling with probs for the draft model
for (int s = 0; s < n_seq_dft; ++s) {
drafts[s].ctx_sampling = llama_sampling_init(params.sparams);

View file

@ -210,6 +210,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
NSString * sourcePath = [bundle pathForResource:@"ggml-metal" ofType:@"metal"];
if (sourcePath == nil) {
GGML_METAL_LOG_WARN("%s: error: could not use bundle path to find ggml-metal.metal, falling back to trying cwd\n", __func__);
sourcePath = @"ggml-metal.metal";
}
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [sourcePath UTF8String]);
NSString * src = [NSString stringWithContentsOfFile:sourcePath encoding:NSUTF8StringEncoding error:&error];
if (error) {

File diff suppressed because it is too large Load diff

View file

@ -1,20 +1,14 @@
#pragma once
// This is a private API for quantization and dequantization
// Should not be used directly, use ggml.h instead
#include "ggml.h"
#include <stdint.h>
#include <assert.h>
#include <stddef.h>
// Super-block size
#ifdef GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
@ -23,10 +17,66 @@
#endif
#endif
#define QK4_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK5_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
#define QK5_1 32
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
#define QK8_0 32
typedef struct {
ggml_fp16_t d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
#define QK8_1 32
typedef struct {
float d; // delta
float s; // d * sum(qs[i])
int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
//
// Super-block quantization structures
//
// Super-block size
#ifdef GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif
// 2-bit quantization
// weight is represented as x = a * q + b
// 16 blocks of 16 elements each
@ -127,6 +177,13 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
// Quantization
void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
@ -134,6 +191,13 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
@ -142,6 +206,13 @@ void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
@ -150,16 +221,14 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int
void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
// Quantization with histogram collection
size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);

2302
ggml.c

File diff suppressed because it is too large Load diff

7
ggml.h
View file

@ -1937,12 +1937,19 @@ extern "C" {
// quantization
//
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
//

View file

@ -963,7 +963,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
//determine mem per token
std::vector<int> tmp = {1, 2, 3, 4};
llama_kv_cache_tokens_rm(llama_ctx_v4, -1, -1);
llama_kv_cache_clear(llama_ctx_v4);
auto er = llama_decode(llama_ctx_v4, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
if(er!=0)
{

274
llama.cpp
View file

@ -20,7 +20,6 @@
#ifdef GGML_USE_MPI
# include "ggml-mpi.h"
#endif
#ifdef GGML_USE_K_QUANTS
#ifndef QK_K
# ifdef GGML_QKK_64
# define QK_K 64
@ -28,7 +27,6 @@
# define QK_K 256
# endif
#endif
#endif
#ifdef __has_include
#if __has_include(<unistd.h>)
@ -1473,17 +1471,12 @@ static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
return 0;
}
static void llama_kv_cache_tokens_rm(struct llama_kv_cache & cache, int32_t c0, int32_t c1) {
if (c0 < 0) c0 = 0;
if (c1 < 0) c1 = cache.size;
for (int32_t i = c0; i < c1; ++i) {
static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
for (int32_t i = 0; i < cache.size; ++i) {
cache.cells[i].pos = -1;
cache.cells[i].seq_id.clear();
}
// Searching for a free slot can start here since we know it will be empty.
cache.head = uint32_t(c0);
cache.head = 0;
}
static void llama_kv_cache_seq_rm(
@ -1497,8 +1490,14 @@ static void llama_kv_cache_seq_rm(
if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
if (seq_id < 0) {
cache.cells[i].seq_id.clear();
} else if (cache.cells[i].has_seq_id(seq_id)) {
cache.cells[i].seq_id.erase(seq_id);
} else {
continue;
}
if (cache.cells[i].seq_id.empty()) {
cache.cells[i].pos = -1;
if (new_head == cache.size) new_head = i;
@ -1559,14 +1558,14 @@ static void llama_kv_cache_seq_shift(
for (uint32_t i = 0; i < cache.size; ++i) {
if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
cache.has_shift = true;
cache.cells[i].pos += delta;
cache.cells[i].delta += delta;
if (cache.cells[i].pos < 0) {
cache.cells[i].pos = -1;
cache.cells[i].seq_id.clear();
if (new_head == cache.size) new_head = i;
} else {
cache.has_shift = true;
cache.cells[i].delta = delta;
}
}
}
@ -2772,19 +2771,19 @@ static void llm_load_tensors(
layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split);
layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split);
layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split);
layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split);
layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
@ -4641,6 +4640,8 @@ static struct ggml_cgraph * llm_build_starcoder(
const float norm_eps = hparams.f_norm_eps;
const int n_gpu_layers = model.n_gpu_layers;
const int32_t n_tokens = batch.n_tokens;
const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n;
const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head;
@ -4685,6 +4686,27 @@ static struct ggml_cgraph * llm_build_starcoder(
}
}
const int i_gpu_start = n_layer - n_gpu_layers;
(void) i_gpu_start;
// offload functions set the tensor output backend to GPU
// tensors are GPU-accelerated if any input or the output has been offloaded
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
offload_func_t offload_func_kq = llama_nop;
offload_func_t offload_func_v = llama_nop;
#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > n_layer) {
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
}
if (n_gpu_layers > n_layer + 1) {
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
}
if (n_gpu_layers > n_layer + 2) {
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
}
#endif // GGML_USE_CUBLAS
{
// Compute position embeddings.
struct ggml_tensor * inp_positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@ -4710,6 +4732,7 @@ static struct ggml_cgraph * llm_build_starcoder(
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
ggml_set_name(KQ_mask, "KQ_mask");
offload_func_kq(KQ_mask);
ggml_allocr_alloc(lctx.alloc, KQ_mask);
if (!ggml_allocr_is_measure(lctx.alloc)) {
float * data = (float *) KQ_mask->data;
@ -4733,44 +4756,67 @@ static struct ggml_cgraph * llm_build_starcoder(
ggml_set_name(inpL, "inpL");
for (int il = 0; il < n_layer; ++il) {
offload_func_t offload_func = llama_nop;
#ifdef GGML_USE_CUBLAS
if (il >= i_gpu_start) {
offload_func = ggml_cuda_assign_buffers_no_alloc;
}
#endif // GGML_USE_CUBLAS
{
// Norm
cur = ggml_norm(ctx0, inpL, norm_eps);
offload_func(cur);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b);
offload_func(cur);
}
{
// Self Attention
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv);
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
offload_func_kq(cur);
struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd);
struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa));
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
offload_func_kq(cur);
struct ggml_tensor * Qcur = tmpq;
struct ggml_tensor * tmpq = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
struct ggml_tensor * tmpk = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
struct ggml_tensor * tmpv = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
ggml_set_name(tmpq, "tmpq");
ggml_set_name(tmpk, "tmpk");
ggml_set_name(tmpv, "tmpv");
offload_func_kq(tmpq);
offload_func_kq(tmpk);
offload_func_v (tmpv);
struct ggml_tensor * Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens);
struct ggml_tensor * Kcur = tmpk;
{
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens));
struct ggml_tensor * Vcur = ggml_transpose(ctx0, tmpv);
offload_func_v(Vcur);
ggml_set_name(Vcur, "Vcur");
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
offload_func_kq(k);
ggml_set_name(k, "k");
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
offload_func_v(v);
ggml_set_name(v, "v");
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_cpy(ctx0,
Qcur,
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)),
0, 2, 1, 3);
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
offload_func_kq(Q);
ggml_set_name(Q, "Q");
struct ggml_tensor * K =
@ -4779,23 +4825,28 @@ static struct ggml_cgraph * llm_build_starcoder(
ggml_element_size(kv_self.k)*n_embd_gqa,
ggml_element_size(kv_self.k)*n_embd_head,
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
offload_func_kq(K);
ggml_set_name(K, "K");
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
offload_func_kq(KQ);
ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd_head)
// KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
offload_func_kq(KQ_scaled);
ggml_set_name(KQ_scaled, "KQ_scaled");
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
offload_func_kq(KQ_masked);
ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
offload_func_v(KQ_soft_max);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
// split cached V into n_head heads
@ -4808,22 +4859,25 @@ static struct ggml_cgraph * llm_build_starcoder(
ggml_set_name(V, "V");
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
offload_func_v(KQV);
ggml_set_name(KQV, "KQV");
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
offload_func_v(KQV_merged);
ggml_set_name(KQV_merged, "KQV_merged");
// cur = KQV_merged.contiguous().view(n_embd, n_tokens)
cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens);
offload_func_v(cur);
ggml_set_name(cur, "KQV_merged_contiguous");
}
// Projection
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo);
offload_func(cur);
// Add the input
cur = ggml_add(ctx0, cur, inpL);
offload_func(cur);
struct ggml_tensor * inpFF = cur;
@ -4832,27 +4886,36 @@ static struct ggml_cgraph * llm_build_starcoder(
// Norm
{
cur = ggml_norm(ctx0, inpFF, norm_eps);
offload_func_nr(cur);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b);
offload_func_nr(cur);
}
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3);
offload_func(cur);
// GELU activation
cur = ggml_gelu(ctx0, cur);
offload_func(cur);
// Projection
cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2);
offload_func(cur);
}
inpL = ggml_add(ctx0, cur, inpFF);
}
// Output Norm
{
cur = ggml_norm(ctx0, inpL, norm_eps);
offload_func_nr(cur);
cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b);
}
ggml_set_name(cur, "result_norm");
}
cur = ggml_mul_mat(ctx0, model.output, cur);
ggml_set_name(cur, "result_output");
@ -6038,11 +6101,20 @@ static int llama_decode_internal(
#endif
// update the kv ring buffer
lctx.kv_self.has_shift = false;
lctx.kv_self.head += n_tokens;
{
if (kv_self.has_shift) {
kv_self.has_shift = false;
for (uint32_t i = 0; i < kv_self.size; ++i) {
kv_self.cells[i].delta = 0;
}
}
kv_self.head += n_tokens;
// Ensure kv cache head points to a valid index.
if (lctx.kv_self.head >= lctx.kv_self.size) {
lctx.kv_self.head = 0;
if (kv_self.head >= kv_self.size) {
kv_self.head = 0;
}
}
#ifdef GGML_PERF
@ -8243,6 +8315,24 @@ struct no_init {
no_init() { /* do nothing */ }
};
struct quantize_state_internal {
const llama_model & model;
const llama_model_quantize_params * params;
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
int i_attention_wv = 0;
int i_feed_forward_w2 = 0;
int n_k_quantized = 0;
int n_fallback = 0;
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
: model(model)
, params(params)
{}
};
static void llama_convert_tensor_internal(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
@ -8301,14 +8391,14 @@ static void llama_convert_tensor_internal(
workers.clear();
}
#ifdef GGML_USE_K_QUANTS
static ggml_type get_k_quant_type(
ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
quantize_state_internal & qs,
ggml_type new_type, const ggml_tensor * tensor, llama_ftype ftype
) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
const auto tn = LLM_TN(model.arch);
const llm_arch arch = qs.model.arch;
const auto tn = LLM_TN(arch);
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
@ -8316,7 +8406,7 @@ static ggml_type get_k_quant_type(
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
int nx = tensor->ne[0];
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (new_type != GGML_TYPE_Q8_0) {
@ -8325,46 +8415,46 @@ static ggml_type get_k_quant_type(
} else if (name.find("attn_v.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
use_more_bits(qs.i_attention_wv, qs.n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && qs.i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
(*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
if (model.type == MODEL_70B) {
(qs.i_attention_wv < qs.n_attention_wv/8 || qs.i_attention_wv >= 7*qs.n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
if (qs.model.type == MODEL_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
}
++*i_attention_wv;
++qs.i_attention_wv;
} else if (name.find("ffn_down.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
: model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
: arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (model.arch == LLM_ARCH_FALCON) {
new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
if (arch == LLM_ARCH_FALCON) {
new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
}
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
new_type = GGML_TYPE_Q5_K;
}
++*i_feed_forward_w2;
++qs.i_feed_forward_w2;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (model.arch != LLM_ARCH_FALCON) {
if (arch != LLM_ARCH_FALCON) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
@ -8391,25 +8481,27 @@ static ggml_type get_k_quant_type(
int nx = tensor->ne[0];
int ny = tensor->ne[1];
if (nx % QK_K != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for %s", __func__, nx, ny, QK_K, ggml_type_name(new_type));
convert_incompatible_tensor = true;
} else {
++qs.n_k_quantized;
}
}
if (convert_incompatible_tensor) {
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
} else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
} else {
throw std::runtime_error("Unsupported tensor size encountered\n");
switch (new_type) {
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
}
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
++qs.n_fallback;
}
return new_type;
}
#endif
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type quantized_type;
@ -8424,7 +8516,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break;
case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break;
#ifdef GGML_USE_K_QUANTS
// K-quants
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
@ -8435,7 +8526,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
#endif
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
}
@ -8462,6 +8553,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
llm_load_arch(ml, model);
llm_load_hparams(ml, model);
struct quantize_state_internal qs(model, params);
if (params->only_copy) {
ftype = model.ftype;
}
@ -8474,10 +8567,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
#ifdef GGML_USE_K_QUANTS
int n_attention_wv = 0;
int n_feed_forward_w2 = 0;
for (int i = 0; i < ml.n_tensors; ++i) {
struct ggml_tensor * meta = ml.get_tensor_meta(i);
@ -8485,21 +8574,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
// TODO: avoid hardcoded tensor names - use the TN_* constants
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
++n_attention_wv;
++qs.n_attention_wv;
}
else if (name.find("ffn_down.weight") != std::string::npos) {
++n_feed_forward_w2;
++qs.n_feed_forward_w2;
}
}
if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
if (qs.n_attention_wv != qs.n_feed_forward_w2 || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
__func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
__func__, qs.n_attention_wv, qs.n_feed_forward_w2, model.hparams.n_layer);
}
int i_attention_wv = 0;
int i_feed_forward_w2 = 0;
#endif
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<int64_t> hist_all(1 << 4, 0);
@ -8563,11 +8648,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (quantize) {
new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
new_type = get_k_quant_type(
new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
);
#endif
if (!params->pure) {
new_type = get_k_quant_type(qs, new_type, tensor, ftype);
}
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
quantize = tensor->type != new_type;
@ -8692,6 +8776,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
LLAMA_LOG_INFO("\n");
}
}
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) incompatible with k-quants and required fallback quantization\n",
__func__, qs.n_fallback, qs.n_k_quantized + qs.n_fallback);
}
}
static int llama_apply_lora_from_file_internal(
@ -9019,6 +9108,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
/*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,
/*.pure =*/ false,
};
return result;
@ -9379,8 +9469,8 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
return ctx->kv_self.head;
}
void llama_kv_cache_tokens_rm(struct llama_context * ctx, int32_t c0, int32_t c1) {
llama_kv_cache_tokens_rm(ctx->kv_self, c0, c1);
void llama_kv_cache_clear(struct llama_context * ctx) {
llama_kv_cache_clear(ctx->kv_self);
}
void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
@ -9836,7 +9926,7 @@ int llama_eval(
llama_token * tokens,
int32_t n_tokens,
int n_past) {
llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
if (ret < 0) {
@ -9851,7 +9941,7 @@ int llama_eval_embd(
float * embd,
int32_t n_tokens,
int n_past) {
llama_kv_cache_tokens_rm(ctx->kv_self, n_past, -1);
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };

13
llama.h
View file

@ -191,6 +191,7 @@ extern "C" {
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // disable k-quant mixtures and quantize all tensors to the same type
} llama_model_quantize_params;
// grammar types
@ -333,15 +334,12 @@ extern "C" {
LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
"avoid using this, it will be removed in the future, instead - count the tokens in user code");
// Remove all tokens data of cells in [c0, c1)
// c0 < 0 : [0, c1]
// c1 < 0 : [c0, inf)
LLAMA_API void llama_kv_cache_tokens_rm(
struct llama_context * ctx,
int32_t c0,
int32_t c1);
// Clear the KV cache
LLAMA_API void llama_kv_cache_clear(
struct llama_context * ctx);
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
// seq_id < 0 : match any sequence
// p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_rm(
@ -660,6 +658,7 @@ extern "C" {
float * mu);
/// @details Selects the token with the highest probability.
/// Does not compute the token probabilities. Use llama_sample_softmax() instead.
LLAMA_API llama_token llama_sample_token_greedy(
struct llama_context * ctx,
llama_token_data_array * candidates);

View file

@ -360,7 +360,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
// method 5
// blocks of QK elements
// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;
@ -484,7 +484,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
// method 4
// blocks of QK elements
// represented with 2 floats (min + delta) and QK/2 8-bit ints (i.e QK 4-bit unsigned integer factors)
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;
@ -529,7 +529,7 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
}
// TODO: vectorize
void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
static void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;
@ -561,7 +561,7 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {
}
}
void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
static void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
assert(k % QK == 0);
const int nb = k / QK;

View file

@ -6,6 +6,8 @@
#include <cstdio>
#endif
#define GGML_USE_K_QUANTS //forced on, now that the flag has been removed upstream
#include "llama-util.h"
#include "llama_v3.h"