Merge branch 'master' of github.com:ggerganov/llama.cpp
commit 1ec3f29bd0
48 changed files with 2070 additions and 724 deletions

.github/workflows/python-lint.yml (new file, 20 lines)
@@ -0,0 +1,20 @@
+name: flake8 Lint
+
+on: [push, pull_request]
+
+jobs:
+  flake8-lint:
+    runs-on: ubuntu-latest
+    name: Lint
+    steps:
+    - name: Check out source repository
+      uses: actions/checkout@v3
+    - name: Set up Python environment
+      uses: actions/setup-python@v4
+      with:
+        python-version: "3.11"
+    - name: flake8 Lint
+      uses: py-actions/flake8@v2
+      with:
+        ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
+        exclude: "examples/*,examples/*/**,*/**/__init__.py"

.gitignore (2 additions)
@@ -47,6 +47,7 @@ models-mnt
 /libllama.so
 /llama-bench
 /llava-cli
+/lookahead
 /main
 /metal
 /perplexity
@@ -64,6 +65,7 @@ models-mnt
 /speculative
 /parallel
 /train-text-from-scratch
+/tokenize
 /vdot
 /common/build-info.cpp
 arm_neon.h

CMakeLists.txt
@@ -574,8 +574,12 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
     endif()
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
     message(STATUS "PowerPC detected")
-    add_compile_options(-mcpu=native -mtune=native)
-    #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
+        add_compile_options(-mcpu=powerpc64le)
+    else()
+        add_compile_options(-mcpu=native -mtune=native)
+        #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
+    endif()
 else()
     message(STATUS "Unknown architecture")
 endif()

Makefile (16 changes)
@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
     main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
     simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-    speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o
+    speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead tests/test-c.o
 
 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -342,6 +342,12 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
     endif
 endif
 
+ifneq ($(filter ppc64le%,$(UNAME_M)),)
+    MK_CFLAGS   += -mcpu=powerpc64le
+    MK_CXXFLAGS += -mcpu=powerpc64le
+    CUDA_POWER_ARCH = 1
+endif
+
 else
     MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
     MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
@@ -392,6 +398,8 @@ else
 endif #LLAMA_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
     NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
+else ifdef CUDA_POWER_ARCH
+    NVCCFLAGS +=
 else
     NVCCFLAGS += -arch=native
 endif # CUDA_DOCKER_ARCH
@@ -586,6 +594,9 @@ infill: examples/infill/infill.cpp ggml.o llama.o $(C
 simple: examples/simple/simple.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+tokenize: examples/tokenize/tokenize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 batched: examples/batched/batched.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
@@ -646,6 +657,9 @@ speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS)
 parallel: examples/parallel/parallel.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
+lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 ifdef LLAMA_METAL
 metal: examples/metal/metal.cpp ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

README.md (17 changes)
@@ -10,7 +10,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-- *No hot topics atm. Open to suggestions about what is hot today*
+- Using `llama.cpp` with AWS instances: https://github.com/ggerganov/llama.cpp/discussions/4225
+- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
+- Collecting Apple Silicon performance stats: https://github.com/ggerganov/llama.cpp/discussions/4167
 
 ----
 
@@ -410,19 +412,28 @@ Building the program with BLAS support may lead to some performance improvements
   This provides BLAS acceleration on HIP-supported AMD GPUs.
   Make sure to have ROCm installed.
   You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
-  Windows support is coming soon...
 
   - Using `make`:
    ```bash
    make LLAMA_HIPBLAS=1
    ```
-  - Using `CMake`:
+  - Using `CMake` for Linux:
    ```bash
    mkdir build
    cd build
    CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
    cmake --build .
    ```
+  - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS):
+   ```bash
+   set PATH=%HIP_PATH%\bin;%PATH%
+   mkdir build
+   cd build
+   cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ ..
+   cmake --build .
+   ```
+   Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
+
 
   The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
   If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.

common/common.cpp
@@ -12,6 +12,7 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include <cinttypes>
@@ -491,8 +492,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         params.interactive_first = true;
     } else if (arg == "-ins" || arg == "--instruct") {
         params.instruct = true;
+    } else if (arg == "-cml" || arg == "--chatml") {
+        params.chatml = true;
     } else if (arg == "--infill") {
         params.infill = true;
+    } else if (arg == "-dkvc" || arg == "--dump-kv-cache") {
+        params.dump_kv_cache = true;
     } else if (arg == "--multiline-input") {
         params.multiline_input = true;
     } else if (arg == "--simple-io") {
@@ -730,6 +735,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -i, --interactive     run in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    printf("  -cml, --chatml        run in chatml mode (use with ChatML-compatible models)\n");
     printf("  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
     printf("  -r PROMPT, --reverse-prompt PROMPT\n");
     printf("                        halt generation at PROMPT, return control in interactive mode\n");
@@ -832,6 +838,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 #endif // GGML_USE_CUBLAS
 #endif
     printf("  --verbose-prompt      print prompt before generation\n");
+    printf("  -dkvc, --dump-kv-cache\n");
+    printf("                        verbose print of the KV cache\n");
     printf("  --simple-io           use basic IO for better compatibility in subprocesses and limited consoles\n");
     printf("  --lora FNAME          apply LoRA adapter (implies --no-mmap)\n");
     printf("  --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
@@ -931,7 +939,7 @@ void llama_batch_add(
     const std::vector<llama_seq_id> & seq_ids,
                                bool   logits) {
     batch.token   [batch.n_tokens] = id;
-    batch.pos     [batch.n_tokens] = pos,
+    batch.pos     [batch.n_tokens] = pos;
     batch.n_seq_id[batch.n_tokens] = seq_ids.size();
     for (size_t i = 0; i < seq_ids.size(); ++i) {
         batch.seq_id[batch.n_tokens][i] = seq_ids[i];
@@ -1072,6 +1080,12 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_to
     return result;
 }
 
+bool llama_should_add_bos_token(const llama_model * model) {
+    const int add_bos = llama_add_bos_token(model);
+
+    return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
+}
+
 //
 // YAML utils
 //
@@ -1188,6 +1202,7 @@ void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const cha
     if (!data_str.empty() && (std::isspace(data_str[0]) || std::isspace(data_str.back()))) {
         data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
         data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+        data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
         data_str = "\"" + data_str + "\"";
         fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
         return;
@@ -1376,3 +1391,77 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
     fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
 }
+
+//
+// KV cache utils
+//
+
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        int seq_count = 0;
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] >= 0) { seq_count++; }
+        }
+        putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
+    }
+
+    printf("\n=== Done dumping\n");
+}
+
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+    static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+    printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
+        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+
+    std::unordered_map<llama_seq_id, size_t> seqs;
+    llama_kv_cache_view_cell * c_curr = view.cells;
+    llama_seq_id * cs_curr = view.cells_sequences;
+
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] < 0) { continue; }
+            if (seqs.find(cs_curr[j]) == seqs.end()) {
+                if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+                seqs[cs_curr[j]] = seqs.size();
+            }
+        }
+        if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
+    }
+
+    printf("=== Sequence legend: ");
+    for (const auto & it : seqs) {
+        printf("%zu=%d, ", it.second, it.first);
+    }
+    printf("'+'=other sequence ids");
+
+    c_curr = view.cells;
+    cs_curr = view.cells_sequences;
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+        if (i % row_size == 0) {
+            printf("\n%5d: ", i);
+        }
+        for (int j = 0; j < view.n_max_seq; j++) {
+            if (cs_curr[j] >= 0) {
+                const auto & it = seqs.find(cs_curr[j]);
+                putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
+            } else {
+                putchar('.');
+            }
+        }
+        putchar(' ');
+    }
+
+    printf("\n=== Done dumping\n");
+}
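
Note on the new `llama_should_add_bos_token` helper above: it reads the model's BOS metadata through `llama_add_bos_token` and only falls back to "SPM vocab => add BOS" when that metadata is absent. A minimal usage sketch follows; the `model`, `ctx` and `prompt` variables are illustrative placeholders rather than part of this commit, and `llama_tokenize` is the common.h wrapper:

    // Respect the model's own preference instead of hard-coding BOS insertion.
    const bool add_bos = llama_should_add_bos_token(model);
    std::vector<llama_token> inp = ::llama_tokenize(ctx, prompt, add_bos);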

common/common.h
@@ -102,6 +102,7 @@ struct gpt_params {
     bool random_prompt    = false; // do not randomize prompt if none provided
     bool use_color        = false; // use color to distinguish generations and inputs
     bool interactive      = false; // interactive mode
+    bool chatml           = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro  = false; // open the prompt cache read-only and do not update it
 
@@ -121,6 +122,7 @@ struct gpt_params {
     bool numa             = false; // attempt optimizations that help on some NUMA systems
     bool verbose_prompt   = false; // print prompt tokens before generation
     bool infill           = false; // use infill mode
+    bool dump_kv_cache    = false; // dump the KV cache contents for debugging purposes
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector
@@ -200,6 +202,10 @@ std::string llama_detokenize_bpe(
         llama_context * ctx,
         const std::vector<llama_token> & tokens);
 
+// Uses the value from the model metadata if possible, otherwise
+// defaults to true when model type is SPM, otherwise false.
+bool llama_should_add_bos_token(const llama_model * model);
+
 //
 // YAML utils
 //
@@ -213,3 +219,13 @@ std::string get_sortable_timestamp();
 void dump_non_result_info_yaml(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
+
+//
+// KV cache utils
+//
+
+// Dump the KV cache view with the number of sequences per cell.
+void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+
+// Dump the KV cache view showing individual sequences in each cell (long output).
+void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
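
Note: `dump_kv_cache_view` and `dump_kv_cache_view_seqs` take a `llama_kv_cache_view`, produced by the KV-cache view API that llama.h gains alongside these helpers. A rough usage sketch, assuming that API (`llama_kv_cache_view_init`, `llama_kv_cache_view_update`, `llama_kv_cache_view_free`) and an existing `llama_context * ctx`; it mirrors how an example could react to the new `-dkvc` flag:

    // Snapshot and print KV-cache occupancy while decoding.
    llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, /*n_max_seq =*/ 4);
    llama_kv_cache_view_update(ctx, &kvc_view);  // refresh the snapshot after a decode step
    dump_kv_cache_view_seqs(kvc_view, 40);       // long form: one character per sequence per cell
    llama_kv_cache_view_free(&kvc_view);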

common/train.cpp
@@ -1136,6 +1136,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
     fprintf(stderr, "  --adam-beta2 N             AdamW beta2 in interval [0,1). How much to smooth the second moment of gradients. (default %f)\n", params->adam_beta2);
     fprintf(stderr, "  --adam-gclip N             AdamW gradient clipping. Disabled when zero. (default %f)\n", params->adam_gclip);
     fprintf(stderr, "  --adam-epsf N              AdamW epsilon for convergence test. Disabled when <= zero. (default %f)\n", params->adam_eps_f);
+    fprintf(stderr, "  -ngl N, --n-gpu-layers N   Number of model layers to offload to GPU (default %d)", params->n_gpu_layers);
     fprintf(stderr, "\n");
 }
 
@@ -1355,6 +1356,17 @@ bool consume_common_train_arg(
             return true;
         }
         params->adam_gclip = std::stof(argv[i]);
+    } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+        if (++i >= argc) {
+            *invalid_param = true;
+            return true;
+        }
+#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
+        params->n_gpu_layers = std::stoi(argv[i]);
+#else
+        fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+        fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+#endif
     } else if (arg == "-h" || arg == "--help") {
         params->print_usage = true;
         return true;

convert-baichuan-hf-to-gguf.py (deleted)
@@ -1,317 +0,0 @@
-#!/usr/bin/env python3
-# HF baichuan --> gguf conversion
-
-from __future__ import annotations
-
-import argparse
-import json
-import os
-import struct
-import sys
-from pathlib import Path
-from typing import TYPE_CHECKING, Any
-import itertools
-import numpy as np
-import torch
-from sentencepiece import SentencePieceProcessor  # type: ignore[import]
-
-if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
-import gguf
-
-
-if TYPE_CHECKING:
-    from typing import TypeAlias
-
-NDArray: TypeAlias = 'np.ndarray[Any, Any]'
-
-# reverse HF permute back to original pth layout
-
-
-def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
-    if n_kv_head is not None and n_head != n_kv_head:
-        n_head //= n_kv_head
-
-    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
-            .swapaxes(1, 2)
-            .reshape(weights.shape))
-
-def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
-    r = weights.shape[0] // 3
-    return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
-
-def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
-    r = weights.shape[0] // 3
-    return weights[r * n_part : r * n_part + r, ...]
-
-def count_model_parts(dir_model: str) -> int:
-    num_parts = 0
-
-    for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
-            num_parts += 1
-
-    if num_parts > 0:
-        print("gguf: found " + str(num_parts) + " model parts")
-
-    return num_parts
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
-    parser.add_argument(
-        "--vocab-only", action="store_true",
-        help="extract only the vocab",
-    )
-    parser.add_argument(
-        "--outfile", type=Path,
-        help="path to write to; default: based on input",
-    )
-    parser.add_argument(
-        "model", type=Path,
-        help="directory containing model file, or model file itself (*.bin)",
-    )
-    parser.add_argument(
-        "ftype", type=int, choices=[0, 1], default=1, nargs='?',
-        help="output format - use 0 for float32, 1 for float16",
-    )
-    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
-    return parser.parse_args()
-
-args = parse_args()
-
-dir_model = args.model
-ftype = args.ftype
-if not dir_model.is_dir():
-    print(f'Error: {args.model} is not a directory', file = sys.stderr)
-    sys.exit(1)
-
-endianess = gguf.GGUFEndian.LITTLE
-if args.bigendian:
-    endianess = gguf.GGUFEndian.BIG
-endianess_str = "Big Endian" if args.bigendian else "Little Endian"
-print(f"gguf: Conversion Endianess {endianess}")
-# possible tensor data types
-# ftype == 0 -> float32
-# ftype == 1 -> float16
-
-# map from ftype to string
-ftype_str = ["f32", "f16"]
-
-if args.outfile is not None:
-    fname_out = args.outfile
-else:
-    # output in the same directory as the model by default
-    fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
-
-print("gguf: loading model "+dir_model.name)
-
-with open(dir_model / "config.json", "r", encoding="utf-8") as f:
-    hparams = json.load(f)
-print("hello print: ",hparams["architectures"][0])
-if hparams["architectures"][0] != "BaichuanForCausalLM" and hparams["architectures"][0] != "BaiChuanForCausalLM":
-    print("Model architecture not supported: " + hparams["architectures"][0])
-
-    sys.exit()
-
-# get number of model parts
-num_parts = count_model_parts(dir_model)
-print(f"num_parts:{num_parts}\n")
-ARCH=gguf.MODEL_ARCH.BAICHUAN
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)
-
-print("gguf: get model metadata")
-
-block_count = hparams["num_hidden_layers"]
-head_count = hparams["num_attention_heads"]
-
-if "num_key_value_heads" in hparams:
-    head_count_kv = hparams["num_key_value_heads"]
-else:
-    head_count_kv = head_count
-
-if "_name_or_path" in hparams:
-    hf_repo = hparams["_name_or_path"]
-else:
-    hf_repo = ""
-
-if "max_sequence_length" in hparams:
-    ctx_length = hparams["max_sequence_length"]
-elif "max_position_embeddings" in hparams:
-    ctx_length = hparams["max_position_embeddings"]
-elif "model_max_length" in hparams:
-    ctx_length = hparams["model_max_length"]
-else:
-    print("gguf: can not find ctx length parameter.")
-
-    sys.exit()
-
-
-gguf_writer.add_name(dir_model.name)
-gguf_writer.add_source_hf_repo(hf_repo)
-gguf_writer.add_tensor_data_layout("Meta AI original pth")
-gguf_writer.add_context_length(ctx_length)
-gguf_writer.add_embedding_length(hparams["hidden_size"])
-gguf_writer.add_block_count(block_count)
-gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
-gguf_writer.add_head_count(head_count)
-gguf_writer.add_head_count_kv(head_count_kv)
-gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
-
-if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
-    if "type" in hparams["rope_scaling"]:
-        if hparams["rope_scaling"]["type"] == "linear":
-            gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
-            gguf_writer.add_rope_scaling_factor(hparams["rope_scaling"]["factor"])
-
-
-# TOKENIZATION
-
-print("gguf: get tokenizer metadata")
-
-tokens: list[bytes] = []
-scores: list[float] = []
-toktypes: list[int] = []
-
-tokenizer_model_file = dir_model / 'tokenizer.model'
-if not tokenizer_model_file.is_file():
-    print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
-    sys.exit(1)
-
-# vocab type sentencepiece
-print("gguf: get sentencepiece tokenizer vocab, scores and token types")
-
-tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
-vocab_size = hparams.get('vocab_size')
-if vocab_size is None:
-    vocab_size = tokenizer.vocab_size()
-
-for i in range(vocab_size):
-    text: bytes
-    score: float
-
-    piece = tokenizer.id_to_piece(i)
-    text = piece.encode("utf-8")
-    score = tokenizer.get_score(i)
-
-    toktype = 1  # defualt to normal token type
-    if tokenizer.is_unknown(i):
-        toktype = 2
-    if tokenizer.is_control(i):
-        toktype = 3
-
-    # toktype = 4 is user-defined = tokens from added_tokens.json
-
-    if tokenizer.is_unused(i):
-        toktype = 5
-    if tokenizer.is_byte(i):
-        toktype = 6
-
-    tokens.append(text)
-    scores.append(score)
-    toktypes.append(toktype)
-
-added_tokens_file = dir_model / 'added_tokens.json'
-if added_tokens_file.is_file():
-    with open(added_tokens_file, "r", encoding="utf-8") as f:
-        addtokens_json = json.load(f)
-
-        print("gguf: get added tokens")
-
-        for key in addtokens_json:
-            tokens.append( key.encode("utf-8") )
-            scores.append(-1000.0)
-            toktypes.append(4) # user-defined token type
-
-
-gguf_writer.add_tokenizer_model("llama")
-gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)
-
-special_vocab = gguf.SpecialVocab(dir_model, n_vocab = len(tokens))
-special_vocab.add_to_gguf(gguf_writer)
-
-# TENSORS
-
-tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
-
-# tensor info
-print("gguf: get tensor metadata")
-
-if num_parts == 0:
-    part_names = iter(("pytorch_model.bin",))
-else:
-    part_names = (
-        f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
-    )
-
-
-for part_name in part_names:
-    if args.vocab_only:
-        break
-    print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
-
-    tmp=model_part
-    for i in range(block_count):
-        if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
-            print(f"Unpacking and permuting layer {i}")
-            tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
-            tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
-            tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
-            del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
-
-    for name in model_part.keys():
-        data = model_part[name]
-        # we don't need these
-        if name.endswith(".rotary_emb.inv_freq"):
-            continue
-
-        old_dtype = data.dtype
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        data = data.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-        gguf_writer.add_tensor(new_name, data)
-
-
-print("gguf: write header")
-gguf_writer.write_header_to_file()
-print("gguf: write metadata")
-gguf_writer.write_kv_data_to_file()
-if not args.vocab_only:
-    print("gguf: write tensors")
-    gguf_writer.write_tensors_to_file()
-
-gguf_writer.close()
-
-print(f"gguf: model successfully exported to '{fname_out}'")
-print("")

convert-hf-to-gguf.py
@@ -59,7 +59,7 @@ class Model:
                 from safetensors import safe_open
                 ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
             else:
-                ctx = contextlib.nullcontext(torch.load(self.dir_model / part_name, map_location="cpu"))
+                ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))
 
             with ctx as model_part:
                 for name in model_part.keys():
@@ -193,7 +193,7 @@ class Model:
             return gguf.MODEL_ARCH.MPT
         if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"):
             return gguf.MODEL_ARCH.BAICHUAN
-        if arch == "FalconForCausalLM":
+        if arch in ("FalconForCausalLM", "RWForCausalLM"):
             return gguf.MODEL_ARCH.FALCON
         if arch == "GPTBigCodeForCausalLM":
             return gguf.MODEL_ARCH.STARCODER
@@ -827,13 +827,14 @@ class StableLMModel(Model):
         self.gguf_writer.add_embedding_length(hparams["hidden_size"])
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
-        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"]*(hparams["hidden_size"] // hparams["num_attention_heads"])))
+        self.gguf_writer.add_rope_dimension_count(int(hparams["rope_pct"] * (hparams["hidden_size"] // hparams["num_attention_heads"])))
         self.gguf_writer.add_head_count(hparams["num_attention_heads"])
         self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True)
         self.gguf_writer.add_layer_norm_eps(1e-5)
 
 ###### CONVERSION LOGIC ######
 
+
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file")
     parser.add_argument(
@@ -879,20 +880,21 @@ print(f"Loading model: {dir_model.name}")
 
 hparams = Model.load_hparams(dir_model)
 
-model_class = Model.from_model_architecture(hparams["architectures"][0])
-model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
+with torch.inference_mode():
+    model_class = Model.from_model_architecture(hparams["architectures"][0])
+    model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
 
-print("Set model parameters")
-model_instance.set_gguf_parameters()
+    print("Set model parameters")
+    model_instance.set_gguf_parameters()
 
-print("Set model tokenizer")
-model_instance.set_vocab()
+    print("Set model tokenizer")
+    model_instance.set_vocab()
 
-if args.vocab_only:
-    print(f"Exporting model vocab to '{fname_out}'")
-    model_instance.write_vocab()
-else:
-    print(f"Exporting model to '{fname_out}'")
-    model_instance.write()
+    if args.vocab_only:
+        print(f"Exporting model vocab to '{fname_out}'")
+        model_instance.write_vocab()
+    else:
+        print(f"Exporting model to '{fname_out}'")
+        model_instance.write()
 
-print(f"Model successfully exported to '{fname_out}'")
+    print(f"Model successfully exported to '{fname_out}'")

convert-llama-ggml-to-gguf.py
@@ -2,7 +2,6 @@
 from __future__ import annotations
 
 import argparse
-import math
 import struct
 import sys
 from enum import IntEnum
@@ -15,11 +14,13 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
+
 class GGMLFormat(IntEnum):
     GGML = 0
     GGMF = 1
     GGJT = 2
 
+
 class GGMLFType(IntEnum):
     ALL_F32 = 0
     MOSTLY_F16 = 1
@@ -39,6 +40,7 @@ class GGMLFType(IntEnum):
     MOSTLY_Q5_K_M = 17
     MOSTLY_Q6_K = 18
 
+
 class Hyperparameters:
     def __init__(self):
         self.n_vocab = self.n_embd = self.n_mult = self.n_head = 0
@@ -70,6 +72,7 @@ class Hyperparameters:
     def __str__(self):
         return f'<Hyperparameters: n_vocab={self.n_vocab}, n_embd={self.n_embd}, n_mult={self.n_mult}, n_head={self.n_head}, n_layer={self.n_layer}, n_rot={self.n_rot}, n_ff={self.n_ff}, ftype={self.ftype.name}>'
 
+
 class Vocab:
     def __init__(self, load_scores = True):
         self.items = []
@@ -91,6 +94,7 @@ class Vocab:
             self.items.append((item_text, item_score))
         return offset - orig_offset
 
+
 class Tensor:
     def __init__(self, use_padding = True):
         self.name = None
@@ -124,6 +128,7 @@ class Tensor:
         # print(n_dims, name_len, dtype, self.dims, self.name, pad)
         return offset - orig_offset
 
+
 class GGMLModel:
     def __init__(self):
         self.hyperparameters = None
@@ -160,8 +165,8 @@ class GGMLModel:
             if ftype not in (GGMLFType.ALL_F32, GGMLFType.MOSTLY_F16):
                 err = 'Quantizations changed in GGJTv2. Can only convert unquantized GGML files older than GGJTv2.'
         elif (self.file_format == GGMLFormat.GGJT and self.format_version == 2):
-            if ftype in ( GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
-                          GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
+            if ftype in (GGMLFType.MOSTLY_Q4_0, GGMLFType.MOSTLY_Q4_1,
+                         GGMLFType.MOSTLY_Q4_1_SOME_F16, GGMLFType.MOSTLY_Q8_0):
                 err = 'Q4 and Q8 quantizations changed in GGJTv3.'
         if len(err) > 0:
             raise ValueError(f'{err} Sorry, your {self.file_format.name}v{self.format_version} file of type {ftype.name} is not eligible for conversion.')
@@ -188,6 +193,7 @@ class GGMLModel:
         hp.set_n_ff(self)
         return offset
 
+
 class GGMLToGGUF:
     def __init__(self, ggml_model, data, cfg, params_override = None, vocab_override = None, special_vocab = None):
         hp = ggml_model.hyperparameters
@@ -218,7 +224,7 @@ class GGMLToGGUF:
         gguf_writer = gguf.GGUFWriter(
             self.cfg.output,
             gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
-            use_temp_file = False )
+            use_temp_file = False)
         self.add_params(gguf_writer)
         self.add_vocab(gguf_writer)
         if self.special_vocab is not None:
@@ -342,7 +348,8 @@ class GGMLToGGUF:
             mapped_name,
             data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
             raw_shape = tempdims,
-            raw_dtype = tensor.dtype )
+            raw_dtype = tensor.dtype)
 
+
 def handle_metadata(cfg, hp):
     import convert
@@ -366,38 +373,40 @@ def handle_metadata(cfg, hp):
         raise ValueError('Unable to load metadata')
     vocab = convert.load_vocab(
         cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
-        cfg.vocabtype )
+        cfg.vocabtype)
     # FIXME: Respect cfg.vocab_dir?
     svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
                                load_merges = cfg.vocabtype == 'bpe',
                                n_vocab = vocab.vocab_size)
     convert.check_vocab_size(params, vocab)
     return (params, vocab, svocab)
 
+
 def handle_args():
     parser = argparse.ArgumentParser(description = 'Convert GGML models to GGUF')
     parser.add_argument('--input', '-i', type = Path, required = True,
                         help = 'Input GGMLv3 filename')
     parser.add_argument('--output', '-o', type = Path, required = True,
                         help ='Output GGUF filename')
     parser.add_argument('--name',
                        help = 'Set model name')
     parser.add_argument('--desc',
                        help = 'Set model description')
     parser.add_argument('--gqa', type = int, default = 1,
                        help = 'grouped-query attention factor (use 8 for LLaMA2 70B)')
     parser.add_argument('--eps', default = '5.0e-06',
                        help = 'RMS norm eps: Use 1e-6 for LLaMA1 and OpenLLaMA, use 1e-5 for LLaMA2')
     parser.add_argument('--context-length', '-c', type=int, default = 2048,
                        help = 'Default max context length: LLaMA1 is typically 2048, LLaMA2 is typically 4096')
     parser.add_argument('--model-metadata-dir', '-m', type = Path,
                        help ='Load HuggingFace/.pth vocab and metadata from the specified directory')
     parser.add_argument("--vocab-dir", type=Path,
                        help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
     parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm",
                        help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)")
     return parser.parse_args()
 
+
 def main():
     cfg = handle_args()
     print(f'* Using config: {cfg}')
@@ -407,7 +416,7 @@ def main():
     data = np.memmap(cfg.input, mode = 'r')
     model = GGMLModel()
     print('* Scanning GGML input file')
-    offset = model.load(data, 0)
+    offset = model.load(data, 0)  # noqa
     print(f'* GGML model hyperparameters: {model.hyperparameters}')
     vocab_override = None
     params_override = None
@@ -422,12 +431,15 @@ def main():
         print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
         if model.file_format == GGMLFormat.GGML:
             print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
-    converter = GGMLToGGUF(model, data, cfg,
+    converter = GGMLToGGUF(
+        model, data, cfg,
         params_override = params_override,
         vocab_override = vocab_override,
-        special_vocab = special_vocab )
+        special_vocab = special_vocab
+    )
     converter.save()
     print(f'* Successful completion. Output saved to: {cfg.output}')
 
+
 if __name__ == '__main__':
     main()

convert-persimmon-to-gguf.py
@@ -9,6 +9,7 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
+
 def _flatten_dict(dct, tensors, prefix=None):
     assert isinstance(dct, dict)
     for key in dct.keys():
@@ -21,6 +22,7 @@ def _flatten_dict(dct, tensors, prefix=None):
             raise ValueError(type(dct[key]))
     return None
 
+
 def _get_sentencepiece_tokenizer_info(dir_model: Path):
     tokenizer_path = dir_model / 'adept_vocab.model'
     print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
@@ -54,6 +56,7 @@ def _get_sentencepiece_tokenizer_info(dir_model: Path):
         pass
     return tokens, scores, toktypes
 
+
 def main():
     parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
     parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
@@ -125,6 +128,5 @@ def main():
     print("")
 
 
-
 if __name__ == '__main__':
     main()
61
convert.py
61
convert.py
|
@@ -46,6 +46,7 @@ DEFAULT_CONCURRENCY = 8
# data types
#


@dataclass(frozen=True)
class DataType:
name: str
@@ -55,15 +56,18 @@ class DataType:
def elements_to_bytes(self, n_elements: int) -> int:
return n_elements * self.dtype.itemsize


@dataclass(frozen=True)
class UnquantizedDataType(DataType):
pass


DT_F16 = UnquantizedDataType('F16', dtype = np.dtype(np.float16), valid_conversions = ['F32', 'Q8_0'])
DT_F32 = UnquantizedDataType('F32', dtype = np.dtype(np.float32), valid_conversions = ['F16', 'Q8_0'])
DT_I32 = UnquantizedDataType('I32', dtype = np.dtype(np.int16), valid_conversions = [])
DT_BF16 = UnquantizedDataType('BF16', dtype = np.dtype(np.uint16), valid_conversions = ['F32', 'F16', 'Q8_0'])


@dataclass(frozen=True)
class QuantizedDataType(DataType):
block_size: int
@@ -77,6 +81,7 @@ class QuantizedDataType(DataType):
assert n_elements % self.block_size == 0, f'Invalid number of elements {n_elements} for {self.name} with block size {self.block_size}'
return self.quantized_dtype.itemsize * (n_elements // self.block_size)


@dataclass(frozen=True)
class Q8_0QuantizedDataType(QuantizedDataType):
# Mini Q8_0 quantization in Python!
@@ -86,6 +91,7 @@ class Q8_0QuantizedDataType(QuantizedDataType):
n_blocks = arr.size // self.block_size
blocks = arr.reshape((n_blocks, self.block_size))
# Much faster implementation of block quantization contributed by @Cebtenzzre

def quantize_blocks_q8_0(blocks: NDArray) -> Iterable[tuple[Any, Any]]:
d = abs(blocks).max(axis = 1) / np.float32(127)
with np.errstate(divide = 'ignore'):
@@ -94,10 +100,11 @@ class Q8_0QuantizedDataType(QuantizedDataType):
yield from zip(d, qs)
return np.fromiter(quantize_blocks_q8_0(blocks), count = n_blocks, dtype = self.quantized_dtype)


DT_Q8_0 = Q8_0QuantizedDataType('Q8_0',
dtype = np.dtype(np.float32), valid_conversions = [],
ggml_type = gguf.GGMLQuantizationType.Q8_0, block_size = 32,
quantized_dtype = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))]))

# Quantized types skipped here because they may also map to np.float32
NUMPY_TYPE_TO_DATA_TYPE: dict[np.dtype[Any], DataType] = {}
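For readers skimming the hunk above: `Q8_0` stores one float16 scale per 32-value block, `d = absmax / 127`, and the block values as int8. A rough standalone NumPy sketch of that idea (the helper names and the random test input below are illustrative, not part of convert.py) could look like this:

```python
import numpy as np

QK8_0 = 32  # block size, matching block_size = 32 in DT_Q8_0 above

def quantize_q8_0(arr: np.ndarray):
    # One float16 scale per 32-value block; quantized values stored as int8.
    blocks = arr.astype(np.float32).reshape(-1, QK8_0)
    d = np.abs(blocks).max(axis=1) / np.float32(127)
    with np.errstate(divide="ignore", invalid="ignore"):
        qs = np.where(d[:, None] == 0, 0, np.round(blocks / d[:, None]))
    return d.astype(np.float16), qs.astype(np.int8)

def dequantize_q8_0(d: np.ndarray, qs: np.ndarray) -> np.ndarray:
    # Each int8 value is scaled back by its block's scale.
    return (d.astype(np.float32)[:, None] * qs.astype(np.float32)).reshape(-1)

x = np.random.randn(64).astype(np.float32)
d, qs = quantize_q8_0(x)
print(np.max(np.abs(x - dequantize_q8_0(d, qs))))  # small round-trip error
```

The round-trip error per element stays within half a quantization step of its block, which is why Q8_0 is close to lossless for typical weight distributions.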
@@ -116,6 +123,8 @@ SAFETENSORS_DATA_TYPES: dict[str, DataType] = {
# TODO: match this with `llama_ftype`
# TODO: rename to LLAMAFileType
# TODO: move to `gguf.py`


class GGMLFileType(enum.IntEnum):
AllF32 = 0
MostlyF16 = 1 # except 1d tensors
@@ -128,6 +137,7 @@ class GGMLFileType(enum.IntEnum):
# 1D tensors are always F32.
return dt if len(tensor.shape) > 1 else DT_F32


GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
GGMLFileType.AllF32 : DT_F32,
GGMLFileType.MostlyF16 : DT_F16,
@@ -138,6 +148,7 @@ GGML_FILE_TYPE_TO_DATA_TYPE: dict[GGMLFileType, DataType] = {
# hparams loading
#


@dataclass
class Params:
n_vocab: int
@@ -167,11 +178,11 @@ class Params:

# try transformer naming first
if "model.layers.0.self_attn.q_proj.weight" in model:
-n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
+n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
-n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
+n_layer = next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
else:
-n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
+n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)

if n_layer < 1:
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
@@ -308,7 +319,7 @@ class BpeVocab:
(item['content'], item['id'])
for item in tokenizer_json.get('added_tokens', [])
# Added tokens here can be duplicates of the main vocabulary.
-if item['content'] not in self.bpe_tokenizer )
+if item['content'] not in self.bpe_tokenizer)

vocab_size: int = len(self.bpe_tokenizer)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
@@ -326,7 +337,6 @@ class BpeVocab:

def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.bpe_tokenizer
-from transformers.models.gpt2 import tokenization_gpt2
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}

for i, _ in enumerate(tokenizer):
@@ -406,6 +416,7 @@ class SentencePieceVocab:
def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"


Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'

#
@@ -413,13 +424,14 @@ Vocab: TypeAlias = 'BpeVocab | SentencePieceVocab'
# TODO: reuse (probably move to gguf.py?)
#


def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
-#print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
+# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))


class Tensor(metaclass=ABCMeta):
@@ -500,7 +512,7 @@ class LazyTensor:
ret = self._load()
# Should be okay if it maps to the same numpy type?
assert ret.data_type == self.data_type or (self.data_type.dtype == ret.data_type.dtype), \
(self.data_type, ret.data_type, self.description)
return ret

def astype(self, data_type: DataType) -> LazyTensor:
@@ -588,6 +600,7 @@ def permute_lazy(lazy_tensor: LazyTensor, n_head: int, n_head_kv: int) -> LazyTe
return lazy_tensor.load().permute(n_head, n_head_kv)
return LazyTensor(load, lazy_tensor.shape, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)


def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_kv: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().permute_part(n_part, n_head, n_head_kv)
@@ -595,6 +608,7 @@ def permute_part_lazy(lazy_tensor: LazyTensor, n_part: int, n_head: int, n_head_
s[0] = s[0] // 3
return LazyTensor(load, s, lazy_tensor.data_type, f'permute({n_head}, {n_head_kv}) ' + lazy_tensor.description)


def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor:
def load() -> Tensor:
return lazy_tensor.load().part(n_part)
@@ -690,6 +704,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
data_base_path=pickle_paths[0][:-4],
zip_file=zf)
model = unpickler.load()
+if 'model' in model: model = model['model']
as_dict = dict(model.items())
return ModelPlus(model=as_dict, paths=[path], format='torch', vocab=None)

@@ -743,6 +758,7 @@ def lazy_load_file(path: Path) -> ModelPlus:
In = TypeVar('In')
Out = TypeVar('Out')


def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], concurrency: int, max_workers: int | None = None, use_processpool_executor: bool = False) -> Iterable[Out]:
'''Parallel map, but with backpressure. If the caller doesn't call `next`
fast enough, this will stop calling `func` at some point rather than
@@ -777,6 +793,7 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc
break
yield result


def check_vocab_size(params: Params, vocab: Vocab) -> None:
if params.n_vocab != vocab.vocab_size:
assert isinstance(vocab, BpeVocab) or isinstance(vocab, SentencePieceVocab)
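The `bounded_parallel_map` helper shown in the hunk above exists to keep memory bounded: new work is only submitted while the consumer keeps pulling results. A simplified, self-contained sketch of that backpressure pattern (an illustration with hypothetical names, not the convert.py implementation) might be:

```python
import concurrent.futures as cf
import time
from collections import deque

def bounded_map(func, items, concurrency):
    # At most `concurrency` tasks are ever in flight, and because this is a
    # generator, new work is only submitted when the caller consumes a result.
    with cf.ThreadPoolExecutor(max_workers=concurrency) as pool:
        pending = deque()
        for item in items:
            pending.append(pool.submit(func, item))
            if len(pending) >= concurrency:
                yield pending.popleft().result()
        while pending:
            yield pending.popleft().result()

def slow_square(x):
    time.sleep(0.01)  # stand-in for an expensive per-tensor operation
    return x * x

for out in bounded_map(slow_square, range(8), concurrency=3):
    print(out)
```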
@@ -795,7 +812,7 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:


class OutputFile:
-def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

def add_meta_arch(self, params: Params) -> None:
@@ -875,7 +892,7 @@ class OutputFile:
self.gguf.close()

@staticmethod
-def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)

of = OutputFile(fname_out, endianess=endianess)
@@ -937,8 +954,9 @@ class OutputFile:

of.close()


def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
-wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0)+".weight"].data_type
+wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) +".weight"].data_type

if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
return GGMLFileType.AllF32
@@ -951,10 +969,12 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT

raise Exception(f"Unexpected combination of types: {name_to_type}")


def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()}


def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
tmap = gguf.TensorNameMap(ARCH, params.n_layer)
should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, []))
@@ -967,7 +987,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:
print(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
-#tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
+# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
@@ -992,6 +1012,7 @@ def convert_model_names(model: LazyModel, params: Params) -> LazyModel:

return out


def nth_multifile_path(path: Path, n: int) -> Path | None:
'''Given any path belonging to a multi-file model (e.g. foo.bin.1), return
the nth path in the model.
@@ -1173,8 +1194,8 @@ def main(args_in: list[str] | None = None) -> None:
# FIXME: Try to respect vocab_dir somehow?
vocab = load_vocab(args.vocab_dir or args.model, args.vocabtype)
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)
outfile = args.outfile
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab)
print(f"Wrote {outfile}")
@@ -1187,8 +1208,8 @@ def main(args_in: list[str] | None = None) -> None:
vocab = load_vocab(vocab_dir, args.vocabtype)
# FIXME: Try to respect vocab_dir somehow?
special_vocab = gguf.SpecialVocab(model_plus.paths[0].parent,
load_merges = args.vocabtype == 'bpe',
n_vocab = vocab.vocab_size)

model = model_plus.model
model = convert_model_names(model, params)
BIN docs/llama-star/idea-arch.key (Executable file): Binary file not shown.
BIN docs/llama-star/idea-arch.pdf (Normal file): Binary file not shown.
@@ -24,6 +24,7 @@ else()
add_subdirectory(llama-bench)
add_subdirectory(llava)
add_subdirectory(main)
+add_subdirectory(tokenize)
add_subdirectory(parallel)
add_subdirectory(perplexity)
add_subdirectory(quantize)
@@ -31,6 +32,7 @@ else()
add_subdirectory(save-load-state)
add_subdirectory(simple)
add_subdirectory(speculative)
+add_subdirectory(lookahead)
add_subdirectory(train-text-from-scratch)
if (LLAMA_METAL)
add_subdirectory(metal)
@@ -153,7 +153,7 @@ while n_cur <= n_len {
// const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

// is it an end of stream? -> mark the stream as finished
-if new_token_id == llama_token_eos(context) || n_cur == n_len {
+if new_token_id == llama_token_eos(model) || n_cur == n_len {
i_batch[i] = -1
// print("")
if n_parallel > 1 {
@@ -21,7 +21,7 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```

-Finetune output files will be saved every N iterations (config with `--save-every N`).
+**Only llama based models are supported!** The output files will be saved every N iterations (config with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number and with 'LATEST' for the latest output.
So in above example after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
@@ -3,9 +3,7 @@

import argparse
import gguf
-import os
import struct
-import sys
import numpy as np
from pathlib import Path
@@ -548,35 +548,35 @@ static void randomize_lora(struct my_llama_lora * lora, int seed, float mean, fl
struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);

randomize_tensor_normal(lora->tok_embeddings_a, rnd);
-randomize_tensor_normal(lora->tok_embeddings_b, rnd);
+ggml_set_zero(lora->tok_embeddings_b);
randomize_tensor_normal(lora->norm_a, rnd);
-randomize_tensor_normal(lora->norm_b, rnd);
+ggml_set_zero(lora->norm_b);
randomize_tensor_normal(lora->output_a, rnd);
-randomize_tensor_normal(lora->output_b, rnd);
+ggml_set_zero(lora->output_b);

for (uint32_t i = 0; i < n_layer; ++i) {
auto & layer = lora->layers[i];
randomize_tensor_normal(layer.attention_norm_a, rnd);
-randomize_tensor_normal(layer.attention_norm_b, rnd);
+ggml_set_zero(layer.attention_norm_b);

randomize_tensor_normal(layer.wq_a, rnd);
-randomize_tensor_normal(layer.wq_b, rnd);
+ggml_set_zero(layer.wq_b);
randomize_tensor_normal(layer.wk_a, rnd);
-randomize_tensor_normal(layer.wk_b, rnd);
+ggml_set_zero(layer.wk_b);
randomize_tensor_normal(layer.wv_a, rnd);
-randomize_tensor_normal(layer.wv_b, rnd);
+ggml_set_zero(layer.wv_b);
randomize_tensor_normal(layer.wo_a, rnd);
-randomize_tensor_normal(layer.wo_b, rnd);
+ggml_set_zero(layer.wo_b);

randomize_tensor_normal(layer.ffn_norm_a, rnd);
-randomize_tensor_normal(layer.ffn_norm_b, rnd);
+ggml_set_zero(layer.ffn_norm_b);

randomize_tensor_normal(layer.w1_a, rnd);
-randomize_tensor_normal(layer.w1_b, rnd);
+ggml_set_zero(layer.w1_b);
randomize_tensor_normal(layer.w2_a, rnd);
-randomize_tensor_normal(layer.w2_b, rnd);
+ggml_set_zero(layer.w2_b);
randomize_tensor_normal(layer.w3_a, rnd);
-randomize_tensor_normal(layer.w3_b, rnd);
+ggml_set_zero(layer.w3_b);
}

free_random_normal_distribution(rnd);
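The switch from `randomize_tensor_normal` to `ggml_set_zero` for the `*_b` factors matches the usual LoRA initialization: with B set to zero the product A·B vanishes, so the adapter is a no-op at step 0 and finetuning starts exactly from the unmodified base weights, while A keeps a random, trainable direction. A tiny NumPy check of that property (the sizes below are made up for illustration, not taken from the diff):

```python
import numpy as np

rng = np.random.default_rng(0)
d, r = 8, 2                      # hypothetical hidden size and LoRA rank
A = rng.normal(size=(d, r))      # randomized, like randomize_tensor_normal
B = np.zeros((r, d))             # zeroed, like ggml_set_zero
W = rng.normal(size=(d, d))      # frozen base weight

W_eff = W + A @ B                # effective weight at the start of finetuning
print(np.allclose(W_eff, W))     # True: the adapter initially changes nothing
```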
@@ -1460,17 +1460,6 @@ static bool train_params_parse(int argc, char ** argv, struct train_params * par
}
params->n_rank_w3 = std::stoi(argv[i]);
params->custom_n_rank_w3 = true;
-} else if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
-if (++i >= argc) {
-invalid_param = true;
-break;
-}
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-params->common.n_gpu_layers = std::stoi(argv[i]);
-#else
-fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
train_print_usage(argc, argv, &default_params);
@@ -146,6 +146,13 @@ int main(int argc, char ** argv) {

return 0;
}
+if (params.chatml) {
+printf("\n************\n");
+printf("%s: please use the 'main' tool for chatml mode\n", __func__);
+printf("************\n\n");
+
+return 0;
+}
if (!params.antiprompt.empty()) {
printf("\n************\n");
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
@@ -230,7 +237,7 @@ int main(int argc, char ** argv) {
LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str());
}
-const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos: %d\n", add_bos);

bool suff_rm_leading_spc = params.escape;
@@ -208,9 +208,10 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
int n_past = 0;

const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
+const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));

// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, true);
+eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
@@ -127,7 +127,14 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long
fclose(file);
return false;
}
-fread(buffer, 1, fileSize, file); // Read the file into the buffer
+errno = 0;
+size_t ret = fread(buffer, 1, fileSize, file); // Read the file into the buffer
+if (ferror(file)) {
+die_fmt("read error: %s", strerror(errno));
+}
+if (ret != (size_t) fileSize) {
+die("unexpectedly reached end of file");
+}
fclose(file); // Close the file

*bytesOut = buffer;
5 examples/lookahead/CMakeLists.txt (Normal file)
@@ -0,0 +1,5 @@
set(TARGET lookahead)
add_executable(${TARGET} lookahead.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
487 examples/lookahead/lookahead.cpp (Normal file)
@@ -0,0 +1,487 @@
#include "common.h"
#include "llama.h"

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>

struct ngram_data {
bool active = false;

llama_seq_id seq_id = -1;

std::vector<int> i_batch;

std::vector<llama_token> tokens;
};

// n-gram container
struct ngram_container {
ngram_container(int n_vocab, int N, int G) {
cnt.resize(n_vocab);
head.resize(n_vocab);
tokens.resize(n_vocab * G * (N - 1));
}

int n_total = 0;

std::vector<int> cnt;
std::vector<int> head;

// [n_vocab][G][N - 1]
// for each token of the vocab, keep a ring-buffer of capacity G of n-grams of size N - 1
std::vector<llama_token> tokens;
};

int main(int argc, char ** argv) {
gpt_params params;

if (gpt_params_parse(argc, argv, params) == false) {
return 1;
}

const int W = 15; // lookahead window
const int N = 5; // n-gram size
const int G = 15; // max verification n-grams

const bool dump_kv_cache = params.dump_kv_cache;

#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("lookahead", "log"));
LOG_TEE("Log start\n");
log_dump_cmdline(argc, argv);
#endif // LOG_DISABLE_LOGS

// init llama.cpp
llama_backend_init(params.numa);

llama_model * model = NULL;
llama_context * ctx = NULL;

// load the target model
std::tie(model, ctx) = llama_init_from_gpt_params(params);

// Tokenize the prompt
const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos tgt: %d\n", add_bos);

std::vector<llama_token> inp;
std::vector<llama_token> all;

inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
all = inp;

const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;

if ((int) inp.size() > max_tokens_list_size) {
fprintf(stderr, "%s: error: prompt too long (%d tokens, max %d)\n", __func__, (int) inp.size(), max_tokens_list_size);
return 1;
}

fprintf(stderr, "\n\n");

for (auto id : inp) {
fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
}

fflush(stderr);

const int n_input = inp.size();

const auto t_enc_start = ggml_time_us();

// eval the prompt
llama_decode(ctx, llama_batch_get_one( inp.data(), n_input - 1, 0, 0));
llama_decode(ctx, llama_batch_get_one(&inp.back(), 1, n_input - 1, 0));

for (int s = 1; s < W + G + 1; ++s) {
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
}

const auto t_enc_end = ggml_time_us();

int n_predict = 0;
int n_accept = 0;

int n_past = inp.size();

llama_token id = 0;

// used to determine end of generation
bool has_eos = false;

// for each decoded batch, we have at most W + G + 1 distinct sequences:
// seq_id == 0 : the current input token
// seq_id [1, W] : tokens from the past N - 1 Jacobi iterations
// seq_id [W + 1, W + G] : verification n-grams
llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

// target model sampling context
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);

// verification n-grams
std::vector<ngram_data> ngrams_cur(G);

// tokens for the past N - 1 Jacobi iterations
std::vector<llama_token> tokens_j_prev(W);
std::vector<std::vector<llama_token>> tokens_j(N - 1);
for (int j = 0; j < N - 1; j++) {
tokens_j[j].resize(W);

for (int i = 0; i < W; i++) {
// there are different ways to init these tokens
if (0) {
// initialize randomly from the prompt tokens
tokens_j[j][i] = all[1 + rand() % (all.size() - 1)];
} else {
// initialize with a sequence of increasing numbers
tokens_j[j][i] = 100 + i;
}
}
}

std::vector<llama_seq_id> seq_id_look;

// the input token belongs both to all sequences
std::vector<llama_seq_id> seq_id_all(W + G + 1);
for (int i = 0; i < W + G + 1; i++) {
seq_id_all[i] = i;
}

// here we keep adding new n-grams as we go
ngram_container ngrams_observed(llama_n_vocab(model), N, G);

// debug
struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, W + G + 1);

const auto t_dec_start = ggml_time_us();

// sample first token
{
id = llama_sampling_sample(ctx_sampling, ctx, NULL, 0);

llama_sampling_accept(ctx_sampling, ctx, id, true);

{
const std::string token_str = llama_token_to_piece(ctx, id);

printf("%s", token_str.c_str());
fflush(stdout);
}
}

while (true) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40);
}

// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
//
// Example for W = 5, N = 4, G = 2:
// (I = input, L = lookahead, V = verification)
//
// Batch: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
// T: -2 -2 -2 -2 -1 -1 -1 -1 -1 0 0 0 0 0 0
// Info: I L L L L L L L L L L L L L L V V V V V V
// Pos: 0 1 2 3 4 1 2 3 4 5 2 3 4 5 6 1 2 3 1 2 3 (+ n_past)
// Logits: 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
// ---------------------------------------------------------------------
// Seq: 0
// 1 1 1
// 2 2 2 2
// 3 3 3 3 3
// 4 4 4 4 4 4
// 5 5 5 5 5 5 5
// 6 6 6 6
// 7 7 7 7
// ---------------------------------------------------------------------
// | | | | | | | | | | |
// V V V V V | | | | | |
// j_tokens | | | | | |
// V V V V V V
// id
{
llama_batch_clear(batch);

// current token - first token of the first level
llama_batch_add(batch, id, n_past, seq_id_all, true);

// verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
{
const int g_cur = ngrams_observed.cnt[id];

ngrams_cur.resize(g_cur);
for (int g = 0; g < g_cur; g++) {
ngrams_cur[g].active = true;
ngrams_cur[g].tokens.resize(N);
ngrams_cur[g].i_batch.resize(N);
ngrams_cur[g].seq_id = W + 1 + g;
ngrams_cur[g].i_batch[0] = 0;
ngrams_cur[g].tokens [0] = id;
}

for (int j = 0; j < N - 1; j++) {
for (int g = 0; g < g_cur; g++) {
const int idx = id*(N - 1)*G + g*(N - 1);

const llama_token t = ngrams_observed.tokens[idx + j];

ngrams_cur[g].tokens [j + 1] = t;
ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;

llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
}
}
}

// fill the remaining W - 1 tokens for the first level
for (int i = 1; i < W; i++) {
seq_id_look.resize(W - i);
for (int j = 0; j < W - i; j++) {
seq_id_look[j] = i + j + 1;
}

llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
}

// fill the rest of the levels
for (int j = 1; j < N - 1; j++) {
for (int i = 0; i < W; i++) {
llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
}
}
}

if (llama_decode(ctx, batch) != 0) {
fprintf(stderr, "\n\n%s: error: llama_decode failed - increase KV cache size\n", __func__);
return 1;
}

int seq_id_best = 0;

for (int v = 0; v < N; ++v) {
int i_batch = 0;

// if no active ngrams are left, it means the sampled token does not pass the verification
if (v > 0) {
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
if (ngrams_cur[g].active) {
i_batch = ngrams_cur[g].i_batch[v];
seq_id_best = ngrams_cur[g].seq_id;

++n_accept;
break;
}
}

// no more matches -> create a new batch
if (i_batch == 0) {
break;
}
}

// sample the next token
id = llama_sampling_sample(ctx_sampling, ctx, NULL, i_batch);

llama_sampling_accept(ctx_sampling, ctx, id, true);

// print
{
const std::string token_str = llama_token_to_piece(ctx, id);

if (v == 0) {
printf("%s", token_str.c_str());
} else {
// print light cyan
printf("\033[0;96m%s\033[0m", token_str.c_str());
}
fflush(stdout);

if (id == llama_token_eos(model)) {
has_eos = true;
}

all.push_back(id);
}

++n_predict;
++n_past;

if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
break;
}

// verify across active n-grams
for (int g = 0; g < (int) ngrams_cur.size(); g++) {
if (ngrams_cur[g].active) {
if (v == N - 1) {
ngrams_cur[g].active = false;
} else {
if (id != ngrams_cur[g].tokens[v + 1]) {
ngrams_cur[g].active = false;
}
}
}
}

// print known n-grams starting with token id (debug)
if (0 && v == 0) {
if (ngrams_observed.cnt[id] > 0) {
printf("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
}

for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
printf(" - ngram %2d: ", i);

const int idx = id*(N - 1)*G + i*(N - 1);

for (int j = 0; j < N - 1; j++) {
const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);

printf("%s", token_str.c_str());
}

printf("\n");
}
}

// update lookahead tokens
{
for (int i = 0; i < W; i++) {
tokens_j_prev[i] = tokens_j[0][i];
}

for (int j = 0; j < N - 2; j++) {
tokens_j[j] = tokens_j[j + 1];
}

if (v == 0) {
// sample from the last level
for (int i = 0; i < W; i++) {
tokens_j[N - 2][i] = llama_sampling_sample(ctx_sampling, ctx, NULL, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
}
} else {
for (int i = 0; i < W; i++) {
// there are different ways to init these tokens
if (0) {
// random init
tokens_j[N - 2][i] = all[1 + rand() % (all.size() - 1)];
} else {
// init from the previous level
tokens_j[N - 2][i] = tokens_j[0][i];
}
}
}
}

// update observed ngrams
if (v == 0) {
// the first token of the n-gram is determined by the index in the container so it is not stored
std::vector<llama_token> ngram(N - 1);

// n-gram generation
// ref: https://github.com/hao-ai-lab/LookaheadDecoding/issues/14#issuecomment-1826198518
for (int f = 0; f < W; ++f) {
const int ft = tokens_j_prev[f]; // first token of the n-gram

for (int j = 0; j < N - 1; ++j) {
ngram[j] = tokens_j[j][f];
}

// filter-out repeating n-grams
{
bool is_unique = true;

for (int k = 0; k < ngrams_observed.cnt[ft]; ++k) {
const int idx = ft*(N - 1)*G + k*(N - 1);

bool is_match = true;
for (int j = 0; j < N - 1; ++j) {
if (ngrams_observed.tokens[idx + j] != ngram[j]) {
is_match = false;
break;
}
}

if (is_match) {
is_unique = false;
break;
}
}

if (!is_unique) {
continue;
}
}

const int head = ngrams_observed.head[ft];
const int idx = ft*(N - 1)*G + head*(N - 1);

for (int i = 0; i < N - 1; i++) {
ngrams_observed.tokens[idx + i] = ngram[i];
}

ngrams_observed.cnt[ft] = std::min(G, ngrams_observed.cnt[ft] + 1);
ngrams_observed.head[ft] = (head + 1) % G;

ngrams_observed.n_total++;
}
}
}

if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
break;
}

// KV cache management
// if no verification token matched, we simply remove all cells from this batch -> no fragmentation
llama_kv_cache_seq_rm(ctx, -1, n_past, -1);

if (seq_id_best != 0) {
// if a verification token matched, we keep the best sequence and remove the rest
// this leads to some KV cache fragmentation
llama_kv_cache_seq_keep(ctx, seq_id_best);
llama_kv_cache_seq_cp (ctx, seq_id_best, 0, -1, -1);
llama_kv_cache_seq_rm (ctx, seq_id_best, -1, -1);

for (int s = 1; s < W + G + 1; ++s) {
llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
}
}
}

auto t_dec_end = ggml_time_us();

LOG_TEE("\n\n");

LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

LOG_TEE("\n");
LOG_TEE("W = %2d\n", W);
LOG_TEE("N = %2d\n", N);
LOG_TEE("G = %2d\n", G);
LOG_TEE("\n");
LOG_TEE("n_predict = %d\n", n_predict);
LOG_TEE("n_accept = %d\n", n_accept);

llama_print_timings(ctx);

llama_kv_cache_view_free(&kvc_view);
llama_sampling_free(ctx_sampling);

llama_batch_free(batch);

llama_free(ctx);
llama_free_model(model);

llama_backend_free();

fprintf(stderr, "\n\n");

return 0;
}
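To make the flat `[n_vocab][G][N - 1]` layout of `ngram_container` easier to follow, here is a small Python sketch of the same ring-buffer bookkeeping, using the indexing `idx = first_token*(N - 1)*G + slot*(N - 1)` from the listing above. It is only an illustration of the data structure, not a port of the example:

```python
# Same roles as in lookahead.cpp: N-gram size N, up to G n-grams per first token.
N, G, n_vocab = 5, 15, 32000

tokens = [0] * (n_vocab * G * (N - 1))
cnt    = [0] * n_vocab   # how many n-grams are stored for each first token
head   = [0] * n_vocab   # ring-buffer write position for each first token

def observe(first_token: int, tail: list) -> None:
    # Store the N-1 trailing tokens in the slot pointed to by head, then advance.
    assert len(tail) == N - 1
    slot = head[first_token]
    idx  = first_token * (N - 1) * G + slot * (N - 1)
    tokens[idx:idx + N - 1] = tail
    cnt[first_token]  = min(G, cnt[first_token] + 1)
    head[first_token] = (slot + 1) % G

def lookup(first_token: int) -> list:
    # Return every stored tail for this first token (the verification candidates).
    out = []
    for slot in range(cnt[first_token]):
        idx = first_token * (N - 1) * G + slot * (N - 1)
        out.append(tokens[idx:idx + N - 1])
    return out

observe(100, [101, 102, 103, 104])
print(lookup(100))  # -> [[101, 102, 103, 104]]
```

The first token is never stored explicitly; it is implied by the index into the buffer, which is why the C++ code only keeps `N - 1` tokens per n-gram.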
@@ -229,13 +229,16 @@ int main(int argc, char ** argv) {
}
}

-const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM;
+const bool add_bos = llama_should_add_bos_token(model);
LOG("add_bos: %d\n", add_bos);

std::vector<llama_token> embd_inp;

-if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
+if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
+if (params.chatml) {
+params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
+}
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true);
} else {
LOG("use session tokens\n");
@@ -313,7 +316,7 @@ int main(int argc, char ** argv) {
}

// number of tokens to keep when resetting context
-if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
+if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
params.n_keep = (int)embd_inp.size();
}

@@ -324,11 +327,23 @@ int main(int argc, char ** argv) {
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());

+// chatml prefix & suffix
+const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true);
+const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
+
+LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
+LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
+
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
params.interactive_first = true;
params.antiprompt.push_back("### Instruction:\n\n");
}
+// similar for chatml mode
+else if (params.chatml) {
+params.interactive_first = true;
+params.antiprompt.push_back("<|im_start|>user\n");
+}

// enable interactive mode if interactive start is specified
if (params.interactive_first) {
@@ -705,7 +720,7 @@ int main(int argc, char ** argv) {

is_interacting = true;
printf("\n");
-} else if (params.instruct) {
+} else if (params.instruct || params.chatml) {
is_interacting = true;
}
}
@@ -713,7 +728,7 @@ int main(int argc, char ** argv) {
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");

-if (params.instruct) {
+if (params.instruct || params.chatml) {
printf("\n> ");
}

@@ -760,6 +775,12 @@ int main(int argc, char ** argv) {
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
+// chatml mode: insert user chat prefix
+if (params.chatml && !is_antiprompt) {
+LOG("inserting chatml prefix\n");
+n_consumed = embd_inp.size();
+embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
+}
if (params.escape) {
process_escapes(buffer);
}
@@ -778,6 +799,11 @@ int main(int argc, char ** argv) {
LOG("inserting instruction suffix\n");
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
+// chatml mode: insert assistant chat suffix
+if (params.chatml) {
+LOG("inserting chatml suffix\n");
+embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
+}

for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
@@ -803,7 +829,7 @@ int main(int argc, char ** argv) {
}

// end of text token
-if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) {
+if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) {
LOG_TEE(" [end of text]\n");
break;
}
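Taken together, the strings added above assemble a standard ChatML prompt: the system prompt is wrapped once at tokenization time, and each interactive turn gets the user prefix and assistant suffix. A short Python sketch of the resulting layout (the helper below is hypothetical; only the template strings come from the diff):

```python
def chatml_prompt(system: str, user: str) -> str:
    # System prompt wrapped once, mirroring the params.prompt rewrite above.
    prompt = "<|im_start|>system\n" + system + "<|im_end|>"
    # Each user turn is framed by cml_pfx, the reply is cued by cml_sfx.
    prompt += "\n<|im_start|>user\n" + user
    prompt += "<|im_end|>\n<|im_start|>assistant\n"
    return prompt

print(chatml_prompt("You are a helpful assistant.", "Hello!"))
```

The antiprompt `<|im_start|>user\n` then hands control back to the user whenever the model starts a new user turn on its own.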
@@ -1,5 +1,5 @@
// A basic application simulating a server with multiple clients.
-// The clients submite requests to the server and they are processed in parallel.
+// The clients submit requests to the server and they are processed in parallel.

#include "common.h"
#include "llama.h"
@@ -113,6 +113,8 @@ int main(int argc, char ** argv) {
// insert new requests as soon as the previous one is done
const bool cont_batching = params.cont_batching;

+const bool dump_kv_cache = params.dump_kv_cache;
+
#ifndef LOG_DISABLE_LOGS
log_set_target(log_filename_generator("parallel", "log"));
LOG_TEE("Log start\n");
@@ -172,6 +174,8 @@ int main(int argc, char ** argv) {
int32_t n_total_gen = 0;
int32_t n_cache_miss = 0;

+struct llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_clients);
+
const auto t_main_start = ggml_time_us();

LOG_TEE("%s: Simulating parallel requests from clients:\n", __func__);
@@ -201,6 +205,11 @@ int main(int argc, char ** argv) {
LOG_TEE("Processing requests ...\n\n");

while (true) {
+if (dump_kv_cache) {
+llama_kv_cache_view_update(ctx, &kvc_view);
+dump_kv_cache_view_seqs(kvc_view, 40);
+}
+
llama_batch_clear(batch);

// decode any currently ongoing sequences
@@ -149,8 +149,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
 
-    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-    const bool add_bos = is_spm;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
     fprintf(stderr, "%s: tokenizing the input ..\n", __func__);
 
@@ -288,8 +287,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
     // Output: `perplexity: 13.5106 [114/114]`
     // BOS tokens will be added for each chunk before eval
 
-    const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM;
-    const bool add_bos = is_spm;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
     const int n_ctx = llama_n_ctx(ctx);
 
     auto tim1 = std::chrono::high_resolution_clock::now();
@@ -481,7 +479,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     fprintf(stderr, "================================= is_spm = %d\n", is_spm);
 
     // This is needed as usual for LLaMA models
-    const bool add_bos = is_spm;
+    const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
 
     // Number of tasks to use when computing the score
     if ( params.hellaswag_tasks < hs_task_count  ) {
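For context, `llama_should_add_bos_token()` is expected to behave roughly like the sketch below: prefer the model's own BOS metadata and fall back to the old SPM-vocab heuristic only when that metadata is absent. This is an illustration of the intent, not a copy of the helper; it assumes `llama_add_bos_token()` returns -1 when the GGUF metadata does not specify anything.

```cpp
#include "llama.h"

// Rough sketch of the helper replacing the is_spm heuristic above.
static bool should_add_bos(const llama_model * model) {
    const int add_bos = llama_add_bos_token(model); // assumed: -1 = unknown, 0 = no, 1 = yes
    if (add_bos != -1) {
        return add_bos != 0;                        // explicit metadata wins
    }
    return llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM; // old fallback
}
```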
@@ -234,6 +234,55 @@ node index.js
 
 - **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
 
+- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc. can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served.
+
+    *Options:*
+
+    See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported.
+
+    *Examples:*
+
+    You can use either the Python `openai` library with appropriate checkpoints:
+
+    ```python
+    import openai
+
+    client = openai.OpenAI(
+        base_url="http://localhost:8080/v1", # "http://<Your api-server IP>:port"
+        api_key = "sk-no-key-required"
+    )
+
+    completion = client.chat.completions.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."},
+            {"role": "user", "content": "Write a limerick about python exceptions"}
+        ]
+    )
+
+    print(completion.choices[0].message)
+    ```
+
+    ... or raw HTTP requests:
+
+    ```shell
+    curl http://localhost:8080/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -H "Authorization: Bearer no-key" \
+    -d '{
+        "model": "gpt-3.5-turbo",
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are ChatGPT, an AI assistant. Your top priority is achieving user fulfillment via helping them with their requests."
+            },
+            {
+                "role": "user",
+                "content": "Write a limerick about python exceptions"
+            }
+        ]
+    }'
+    ```
+
 ## More examples
 
 ### Change system prompt on runtime
@@ -94,6 +94,10 @@ export async function* llama(prompt, params = {}, config = {}) {
           break;
         }
       }
+      if (result.error) {
+        result.error = JSON.parse(result.error);
+        console.error(`llama.cpp error: ${result.error.content}`);
+      }
     }
   }
 }
@@ -29,6 +29,8 @@
 #define SERVER_VERBOSE 1
 #endif
 
+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
+
 using json = nlohmann::json;
 
 struct server_params
@@ -59,6 +61,10 @@ static bool server_verbose = false;
 #define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
 
+json oaicompat_completion_params_parse(const json &body);
+std::string format_chatml(std::vector<json> messages);
+
+
 //
 // base64 utils (TODO: move to common in the future)
 //
@@ -378,6 +384,9 @@ struct llama_client_slot
     bool stopped_word  = false;
     bool stopped_limit = false;
 
+    bool oaicompat = false;
+    std::string oaicompat_model;
+
     std::string stopping_word;
 
     // sampling
@@ -477,7 +486,7 @@ struct llama_client_slot
        };
    }
 
-    void print_timings() {
+    void print_timings() const {
        LOG_TEE("\n");
        LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, t_prompt_processing, num_prompt_tokens_processed, t_prompt_processing / num_prompt_tokens_processed, 1e3 / t_prompt_processing * num_prompt_tokens_processed);
@@ -501,6 +510,7 @@ struct llama_server_context
     bool multimodal         = false;
     bool clean_kv_cache     = true;
     bool all_slots_are_idle = false;
+    bool add_bos_token      = true;
 
     int32_t id_gen;
     int32_t n_ctx;  // total context for all clients / slots
@@ -573,6 +583,8 @@ struct llama_server_context
 
        n_ctx = llama_n_ctx(ctx);
 
+        add_bos_token = llama_should_add_bos_token(model);
+
        return true;
    }
 
@@ -606,6 +618,11 @@ struct llama_server_context
 
    std::vector<llama_token> tokenize(const json & json_prompt, bool add_bos) const
    {
+        // TODO: currently, we tokenize using special tokens by default
+        //       this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
+        //       but it's better compared to completely ignoring ChatML and other chat templates
+        const bool TMP_FORCE_SPECIAL = true;
+
        // If `add_bos` is true, we only add BOS, when json_prompt is a string,
        // or the first element of the json_prompt array is a string.
        std::vector<llama_token> prompt_tokens;
@@ -621,12 +638,12 @@ struct llama_server_context
                std::vector<llama_token> p;
                if (first)
                {
-                    p = ::llama_tokenize(ctx, s, add_bos);
+                    p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
                    first = false;
                }
                else
                {
-                    p = ::llama_tokenize(ctx, s, false);
+                    p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
                }
                prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
            }
@@ -643,7 +660,7 @@ struct llama_server_context
        else
        {
            auto s = json_prompt.template get<std::string>();
-            prompt_tokens = ::llama_tokenize(ctx, s, add_bos);
+            prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
        }
 
        return prompt_tokens;
@@ -674,6 +691,14 @@ struct llama_server_context
        slot_params default_params;
        llama_sampling_params default_sparams;
 
+        if (data.count("__oaicompat") != 0) {
+            slot->oaicompat = true;
+            slot->oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+        } else {
+            slot->oaicompat = false;
+            slot->oaicompat_model = "";
+        }
+
        slot->params.stream       = json_value(data, "stream",       false);
        slot->params.cache_prompt = json_value(data, "cache_prompt", false);
        slot->params.n_predict    = json_value(data, "n_predict",    default_params.n_predict);
@@ -864,7 +889,7 @@ struct llama_server_context
    }
 
    void update_system_prompt() {
-        system_tokens = ::llama_tokenize(ctx, system_prompt, true);
+        system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
 
        llama_batch_clear(batch);
 
@@ -1092,6 +1117,7 @@ struct llama_server_context
        std::lock_guard<std::mutex> lock(mutex_results);
        task_result res;
        res.id = id;
+        res.stop = false;
        res.error = true;
        res.result_json = { { "content", error } };
        queue_results.push_back(res);
@@ -1166,6 +1192,12 @@ struct llama_server_context
            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output);
        }
 
+        if (slot.oaicompat)
+        {
+            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
+            res.result_json["model"] = slot.oaicompat_model;
+        }
+
        queue_results.push_back(res);
    }
 
@@ -1213,6 +1245,12 @@ struct llama_server_context
            res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs);
        }
 
+        if (slot.oaicompat)
+        {
+            res.result_json["oaicompat_token_ctr"] = slot.n_decoded;
+            res.result_json["model"] = slot.oaicompat_model;
+        }
+
        queue_results.push_back(res);
    }
 
@@ -1252,7 +1290,8 @@ struct llama_server_context
        std::lock_guard<std::mutex> lock(mutex_tasks);
        task_server task;
        task.id = id_gen++;
-        task.data = data;
+        task.target_id = 0;
+        task.data = std::move(data);
        task.infill_mode = infill;
        task.embedding_mode = embedding;
        task.type = COMPLETION_TASK;
@@ -1552,7 +1591,7 @@ struct llama_server_context
                }
                else
                {
-                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt
                }
 
                slot.num_prompt_tokens = prompt_tokens.size();
@@ -1629,7 +1668,7 @@ struct llama_server_context
                    const bool has_images = process_images(slot);
 
                    // process the prefix of first image
-                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, true) : prompt_tokens;
+                    std::vector<llama_token> prefix_tokens = has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens;
                    for (; slot.n_past < (int) prefix_tokens.size(); ++slot.n_past)
                    {
                        llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot.n_past, { slot.id }, false);
@@ -2175,6 +2214,233 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
    }
 }
 
+
+static std::string random_string()
+{
+    static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    std::random_device rd;
+    std::mt19937 generator(rd());
+
+    std::string result(32, ' ');
+
+    for (int i = 0; i < 32; ++i) {
+        result[i] = str[generator() % str.size()];
+    }
+
+    return result;
+}
+
+static std::string gen_chatcmplid()
+{
+    std::stringstream chatcmplid;
+    chatcmplid << "chatcmpl-" << random_string();
+    return chatcmplid.str();
+}
+
+std::string format_chatml(std::vector<json> messages)
+{
+    std::ostringstream chatml_msgs;
+
+    for (auto it = messages.begin(); it != messages.end(); ++it) {
+        chatml_msgs << "<|im_start|>"
+                    << json_value(*it, "role", std::string("user")) << '\n';
+        chatml_msgs << json_value(*it, "content", std::string(""))
+                    << "<|im_end|>\n";
+    }
+
+    chatml_msgs << "<|im_start|>assistant" << '\n';
+
+    return chatml_msgs.str();
+}
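For illustration only (not part of the patch), this fragment shows the prompt string `format_chatml()` above produces for a short, made-up conversation; it assumes the surrounding server.cpp context (the `json` alias and `json_value` helper).

```cpp
std::vector<json> messages = {
    { {"role", "system"}, {"content", "You are a helpful assistant."} },
    { {"role", "user"},   {"content", "Hello!"} },
};
const std::string prompt = format_chatml(messages);
// prompt ==
//   "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
//   "<|im_start|>user\nHello!<|im_end|>\n"
//   "<|im_start|>assistant\n"
```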
+
+/* llama.cpp completion api semantics */
+json oaicompat_completion_params_parse(
+    const json &body /* openai api json semantics */)
+{
+    json llama_params;
+
+    llama_params["__oaicompat"] = true;
+
+    // Map OpenAI parameters to llama.cpp parameters
+    llama_params["prompt"]            = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
+    llama_params["temperature"]       = json_value(body, "temperature", 0.8);
+    llama_params["top_k"]             = json_value(body, "top_k", 40);
+    llama_params["top_p"]             = json_value(body, "top_p", 0.95);
+    llama_params["n_predict"]         = json_value(body, "max_tokens", -1);
+    llama_params["logit_bias"]        = json_value(body, "logit_bias", json::object());
+    llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
+    llama_params["presence_penalty"]  = json_value(body, "presence_penalty", 0.0);
+    llama_params["seed"]              = json_value(body, "seed", 0);
+    llama_params["stream"]            = json_value(body, "stream", false);
+    llama_params["mirostat"]          = json_value(body, "mirostat", false);
+    llama_params["mirostat_tau"]      = json_value(body, "mirostat_tau", 0.0);
+    llama_params["mirostat_eta"]      = json_value(body, "mirostat_eta", 0.0);
+    llama_params["penalize_nl"]       = json_value(body, "penalize_nl", false);
+    llama_params["typical_p"]         = json_value(body, "typical_p", 0.0);
+    llama_params["repeat_last_n"]     = json_value(body, "repeat_last_n", 0);
+    llama_params["ignore_eos"]        = json_value(body, "ignore_eos", false);
+    llama_params["tfs_z"]             = json_value(body, "tfs_z", 0.0);
+
+    if (llama_params.count("grammar") != 0) {
+        llama_params["grammar"] = json_value(body, "grammar", json::object());
+    }
+
+    // Handle 'stop' field
+    if (body["stop"].is_null()) {
+        llama_params["stop"] = json::array({});
+    } else if (body["stop"].is_string()) {
+        llama_params["stop"] = json::array({body["stop"].get<std::string>()});
+    } else {
+        llama_params["stop"] = json_value(body, "stop", json::array());
+    }
+
+    // Ensure there is ChatML-specific end sequence among stop words
+    llama_params["stop"].push_back("<|im_end|>");
+
+    return llama_params;
+}
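A hedged usage example of the mapping performed by `oaicompat_completion_params_parse()` above; the request body is made up, while the resulting fields follow directly from the code.

```cpp
json body = {
    {"model",       "gpt-3.5-turbo"},
    {"messages",    json::array({ { {"role", "user"}, {"content", "Hi"} } })},
    {"max_tokens",  64},
    {"temperature", 0.7},
    {"stop",        nullptr},
};
json params = oaicompat_completion_params_parse(body);
// params["prompt"]      == ChatML string built by format_chatml()
// params["n_predict"]   == 64               (renamed from "max_tokens")
// params["stop"]        == ["<|im_end|>"]   (ChatML end tag is always appended)
// params["__oaicompat"] == true             (marks the slot for OAI-style responses)
```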
+
+static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
+{
+    json result = response.result_json;
+
+    bool stopped_word        = result.count("stopped_word") != 0;
+    bool stopped_eos         = json_value(result, "stopped_eos", false);
+    int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
+    int num_prompt_tokens    = json_value(result, "tokens_evaluated", 0);
+    std::string content      = json_value(result, "content", std::string(""));
+
+    std::string finish_reason = "length";
+    if (stopped_word || stopped_eos) {
+        finish_reason = "stop";
+    }
+
+    json choices =
+        streaming ? json::array({json{{"finish_reason", finish_reason},
+                                      {"index", 0},
+                                      {"delta", json::object()}}})
+                  : json::array({json{{"finish_reason", finish_reason},
+                                      {"index", 0},
+                                      {"message", json{{"content", content},
+                                                       {"role", "assistant"}}}}});
+
+    std::time_t t = std::time(0);
+
+    json res =
+        json{{"choices", choices},
+             {"created", t},
+             {"model",
+              json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
+             {"object", streaming ? "chat.completion.chunk" : "chat.completion"},
+             {"usage",
+              json{{"completion_tokens", num_tokens_predicted},
+                   {"prompt_tokens", num_prompt_tokens},
+                   {"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
+             {"id", gen_chatcmplid()}};
+
+    if (server_verbose) {
+        res["__verbose"] = result;
+    }
+
+    if (result.contains("completion_probabilities")) {
+        res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
+    }
+
+    return res;
+}
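Illustrative shape of the non-streaming payload assembled by `format_final_response_oaicompat()` above, assuming generation stopped on a stop word; all values are placeholders and the field names follow the code.

```cpp
json example = {
    {"choices", json::array({ {
        {"finish_reason", "stop"},
        {"index", 0},
        {"message", { {"role", "assistant"}, {"content", "Hello!"} }}
    } })},
    {"created", 1700000000},
    {"model", "gpt-3.5-turbo-0613"},
    {"object", "chat.completion"},
    {"usage", { {"completion_tokens", 5}, {"prompt_tokens", 12}, {"total_tokens", 17} }},
    {"id", "chatcmpl-..."}
};
```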
+
+// return value is vector as there is one case where we might need to generate two responses
+static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
+    json result = response.result_json;
+
+    if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
+        return std::vector<json>({response.result_json});
+    }
+
+    bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
+    std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
+
+    bool stopped_word   = json_value(result, "stopped_word", false);
+    bool stopped_eos    = json_value(result, "stopped_eos", false);
+    bool stopped_limit  = json_value(result, "stopped_limit", false);
+    std::string content = json_value(result, "content", std::string(""));
+
+    std::string finish_reason;
+    if (stopped_word || stopped_eos) {
+        finish_reason = "stop";
+    }
+    if (stopped_limit) {
+        finish_reason = "length";
+    }
+
+    std::time_t t = std::time(0);
+
+    json choices;
+
+    if (!finish_reason.empty()) {
+        choices = json::array({json{{"finish_reason", finish_reason},
+                                    {"index", 0},
+                                    {"delta", json::object()}}});
+    } else {
+        if (first) {
+            if (content.empty()) {
+                choices = json::array({json{{"finish_reason", nullptr},
+                                            {"index", 0},
+                                            {"delta", json{{"role", "assistant"}}}}});
+            } else {
+                // We have to send this as two updates to conform to openai behavior
+                json initial_ret = json{{"choices", json::array({json{
+                                            {"finish_reason", nullptr},
+                                            {"index", 0},
+                                            {"delta", json{
+                                                {"role", "assistant"}
+                                            }}}})},
+                                        {"created", t},
+                                        {"id", gen_chatcmplid()},
+                                        {"model", modelname},
+                                        {"object", "chat.completion.chunk"}};
+
+                json second_ret = json{
+                                        {"choices", json::array({json{{"finish_reason", nullptr},
+                                                                      {"index", 0},
+                                                                      {"delta", json{
+                                                                          {"content", content}}}
+                                                                      }})},
+                                        {"created", t},
+                                        {"id", gen_chatcmplid()},
+                                        {"model", modelname},
+                                        {"object", "chat.completion.chunk"}};
+
+                return std::vector<json>({initial_ret, second_ret});
+            }
+        } else {
+            // Some idiosyncrasy in task processing logic makes several trailing calls
+            // with empty content, we ignore these at the callee site.
+            if (content.empty()) {
+                return std::vector<json>({json::object()});
+            }
+
+            choices = json::array({json{
+                {"finish_reason", nullptr},
+                {"index", 0},
+                {"delta",
+                 json{
+                     {"content", content},
+                 }},
+            }});
+        }
+    }
+
+    json ret = json{{"choices", choices},
+                    {"created", t},
+                    {"id", gen_chatcmplid()},
+                    {"model", modelname},
+                    {"object", "chat.completion.chunk"}};
+
+    return std::vector<json>({ret});
+}
+
 static json format_partial_response(
    llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector<completion_token_output> &probs
 ) {
@@ -2351,9 +2617,9 @@ int main(int argc, char **argv)
                task_result result = llama.next_result(task_id);
                if (!result.error) {
                    const std::string str =
                        "data: " +
                        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
                        "\n\n";
                    LOG_VERBOSE("data stream", {
                        { "to_send", str }
                    });
@@ -2365,6 +2631,17 @@ int main(int argc, char **argv)
                        break;
                    }
                } else {
+                    const std::string str =
+                        "error: " +
+                        result.result_json.dump(-1, ' ', false, json::error_handler_t::replace) +
+                        "\n\n";
+                    LOG_VERBOSE("data stream", {
+                        { "to_send", str }
+                    });
+                    if (!sink.write(str.c_str(), str.size()))
+                    {
+                        return false;
+                    }
                    break;
                }
            }
@@ -2382,6 +2659,98 @@ int main(int argc, char **argv)
            }
        });
 
+
+
+    svr.Get("/v1/models", [&params](const httplib::Request&, httplib::Response& res)
+            {
+                std::time_t t = std::time(0);
+
+                json models = {
+                    {"object", "list"},
+                    {"data", {
+                        {
+                            {"id", params.model_alias},
+                            {"object", "model"},
+                            {"created", t},
+                            {"owned_by", "llamacpp"}
+                        },
+                    }}
+                };
+
+                res.set_content(models.dump(), "application/json");
+            });
+
+    // TODO: add mount point without "/v1" prefix -- how?
+    svr.Post("/v1/chat/completions", [&llama](const httplib::Request &req, httplib::Response &res)
+            {
+                json data = oaicompat_completion_params_parse(json::parse(req.body));
+
+                const int task_id = llama.request_completion(data, false, false);
+
+                if (!json_value(data, "stream", false)) {
+                    std::string completion_text;
+                    task_result result = llama.next_result(task_id);
+
+                    if (!result.error && result.stop) {
+                        json oaicompat_result = format_final_response_oaicompat(data, result);
+
+                        res.set_content(oaicompat_result.dump(-1, ' ', false,
+                                            json::error_handler_t::replace),
+                                        "application/json");
+                    } else {
+                        res.status = 500;
+                        res.set_content(result.result_json["content"], "text/plain");
+                        return;
+                    }
+                } else {
+                    const auto chunked_content_provider = [task_id, &llama](size_t, httplib::DataSink &sink) {
+                        while (true) {
+                            task_result llama_result = llama.next_result(task_id);
+                            if (!llama_result.error) {
+                                std::vector<json> result_array = format_partial_response_oaicompat( llama_result);
+
+                                for (auto it = result_array.begin(); it != result_array.end(); ++it)
+                                {
+                                    if (!it->empty()) {
+                                        const std::string str =
+                                            "data: " +
+                                            it->dump(-1, ' ', false, json::error_handler_t::replace) +
+                                            "\n\n";
+                                        LOG_VERBOSE("data stream", {{"to_send", str}});
+                                        if (!sink.write(str.c_str(), str.size())) {
+                                            return false;
+                                        }
+                                    }
+                                }
+                                if (llama_result.stop) {
+                                    break;
+                                }
+                            } else {
+                                const std::string str =
+                                    "error: " +
+                                    llama_result.result_json.dump(-1, ' ', false,
+                                        json::error_handler_t::replace) +
+                                    "\n\n";
+                                LOG_VERBOSE("data stream", {{"to_send", str}});
+                                if (!sink.write(str.c_str(), str.size())) {
+                                    return false;
+                                }
+                                break;
+                            }
+                        }
+                        sink.done();
+                        return true;
+                    };
+
+                    auto on_complete = [task_id, &llama](bool) {
+                        // cancel request
+                        llama.request_cancel(task_id);
+                    };
+
+                    res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
+                }
+            });
+
    svr.Post("/infill", [&llama](const httplib::Request &req, httplib::Response &res)
            {
                json data = json::parse(req.body);
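For completeness, a minimal C++ client for the new endpoint, analogous to the Python and curl examples in the README. It assumes the bundled cpp-httplib and nlohmann::json headers and a server already listening on localhost:8080; it is a sketch, not part of the patch.

```cpp
#include "httplib.h"
#include "json.hpp"
#include <cstdio>

int main() {
    nlohmann::json req = {
        {"model", "gpt-3.5-turbo"},
        {"messages", nlohmann::json::array({
            { {"role", "user"}, {"content", "Write a haiku about tokenizers"} }
        })}
    };

    httplib::Client cli("localhost", 8080);                                   // local llama.cpp server
    auto res = cli.Post("/v1/chat/completions", req.dump(), "application/json");
    if (res && res->status == 200) {
        std::printf("%s\n", res->body.c_str());                               // OpenAI-style JSON response
    }
    return 0;
}
```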
@@ -94,9 +94,22 @@ int main(int argc, char ** argv) {
        }
    }
 
-    // tokenize the prompt
+    // Tokenize the prompt
+    const bool add_bos_tgt = llama_should_add_bos_token(model_tgt);
+    LOG("add_bos tgt: %d\n", add_bos_tgt);
+
+    const bool add_bos_dft = llama_should_add_bos_token(model_dft);
+    LOG("add_bos dft: %d\n", add_bos_dft);
+
+    if (add_bos_tgt != add_bos_dft) {
+        fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__);
+        fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt);
+        return 1;
+    }
+
    std::vector<llama_token> inp;
-    inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
+    inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true);
 
    const int max_context_size     = llama_n_ctx(ctx_tgt);
    const int max_tokens_list_size = max_context_size - 4;
examples/tokenize/CMakeLists.txt
Normal file
5
examples/tokenize/CMakeLists.txt
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
set(TARGET tokenize)
|
||||||
|
add_executable(${TARGET} tokenize.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
44
examples/tokenize/tokenize.cpp
Normal file
44
examples/tokenize/tokenize.cpp
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
if (argc < 3 || argv[1][0] == '-') {
|
||||||
|
printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * model_path = argv[1];
|
||||||
|
const char * prompt = argv[2];
|
||||||
|
|
||||||
|
const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids";
|
||||||
|
|
||||||
|
llama_backend_init(false);
|
||||||
|
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
model_params.vocab_only = true;
|
||||||
|
llama_model * model = llama_load_model_from_file(model_path, model_params);
|
||||||
|
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
|
||||||
|
|
||||||
|
const bool add_bos = llama_should_add_bos_token(model);
|
||||||
|
|
||||||
|
std::vector<llama_token> tokens;
|
||||||
|
|
||||||
|
tokens = ::llama_tokenize(model, prompt, add_bos, true);
|
||||||
|
|
||||||
|
for (int i = 0; i < (int) tokens.size(); i++) {
|
||||||
|
if (printing_ids) {
|
||||||
|
printf("%d\n", tokens[i]);
|
||||||
|
} else {
|
||||||
|
printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
165
ggml-cuda.cu

@@ -1,4 +1,5 @@
 #include <algorithm>
+#include <cinttypes>
 #include <cstddef>
 #include <cstdint>
 #include <limits>
@@ -88,6 +89,8 @@
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
 
+#define GGML_CUDA_MAX_NODES 8192
+
 // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
 // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
 // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@@ -233,7 +236,7 @@ typedef float2 dfloat2;
 #endif //GGML_CUDA_F16
 
 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
    int x32 = 0;
    x32 |= x16[0] << 0;
@@ -243,7 +246,7 @@ static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const
 }
 
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
 
    int x32 = 0;
    x32 |= x16[0] << 0;
@@ -253,11 +256,11 @@ static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, con
 }
 
 static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
-    return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
 }
 
 template<typename T>
@@ -467,7 +470,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } };
 
 struct ggml_tensor_extra_gpu {
    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
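The remaining ggml-cuda.cu hunks are mechanical warning fixes. A small host-side illustration of the two patterns they apply, assuming -Wcast-qual and -Wunused-parameter are enabled (names here are made up for the example):

```cpp
#include <cstdint>

int sum_bytes(const void * vx, int n) {
    // Casting a pointer-to-const source through a non-const type trips -Wcast-qual:
    //   const int8_t * x = (int8_t *) vx;    // warns
    const int8_t * x = (const int8_t *) vx;   // clean, identical behavior
    int acc = 0;
    for (int i = 0; i < n; ++i) {
        acc += x[i];
    }
    return acc;
}

void kernel_stub(int * x_ql, int * x_qh, int * x_sc) {
    (void)x_qh; (void)x_sc; // silence -Wunused-parameter in variants that ignore these tiles
    x_ql[0] = 0;
}
```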
@@ -2246,6 +2249,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0];
@@ -2257,7 +2261,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset < nwarps);
    GGML_CUDA_ASSUME(k >= 0);
@@ -2266,7 +2270,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
    const int kbx  = k / QI4_0;
    const int kqsx = k % QI4_0;
 
-    const block_q4_0 * bx0 = (block_q4_0 *) vx;
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
 
    float * x_dmf = (float *) x_dm;
 
@@ -2304,9 +2308,10 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (float *) x_dm;
+    const float * x_dmf = (const float *) x_dm;
 
    int u[2*VDR_Q4_0_Q8_1_MMQ];
 
@@ -2340,6 +2345,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1];
@@ -2351,6 +2357,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2360,7 +2367,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
    const int kbx  = k / QI4_1;
    const int kqsx = k % QI4_1;
 
-    const block_q4_1 * bx0 = (block_q4_1 *) vx;
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
 
 #pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2395,6 +2402,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
@@ -2432,6 +2440,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
    __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0];
@@ -2443,6 +2452,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_0(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_0(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2452,7 +2462,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
    const int kbx  = k / QI5_0;
    const int kqsx = k % QI5_0;
 
-    const block_q5_0 * bx0 = (block_q5_0 *) vx;
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
 
 #pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2507,6 +2517,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
@@ -2546,6 +2557,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh; (void)x_sc;
 
    __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
    __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1];
@@ -2557,6 +2569,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_1(
    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
    int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh; (void)x_sc;
 
    GGML_CUDA_ASSUME(i_offset >= 0);
    GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -2566,7 +2579,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
    const int kbx  = k / QI5_1;
    const int kqsx = k % QI5_1;
 
-    const block_q5_1 * bx0 = (block_q5_1 *) vx;
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
 
 #pragma unroll
    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2618,6 +2631,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh; (void)x_sc;
 
    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
@ -2652,6 +2666,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
||||||
|
(void)x_qh; (void)x_sc;
|
||||||
|
|
||||||
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
__shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
|
||||||
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
|
__shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
|
||||||
|
@ -2663,6 +2678,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(
|
||||||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q8_0(
|
||||||
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
||||||
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
||||||
|
(void)x_qh; (void)x_sc;
|
||||||
|
|
||||||
GGML_CUDA_ASSUME(i_offset >= 0);
|
GGML_CUDA_ASSUME(i_offset >= 0);
|
||||||
GGML_CUDA_ASSUME(i_offset < nwarps);
|
GGML_CUDA_ASSUME(i_offset < nwarps);
|
||||||
|
@ -2673,7 +2689,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||||
const int kqsx = k % QI8_0;
|
const int kqsx = k % QI8_0;
|
||||||
float * x_dmf = (float *) x_dm;
|
float * x_dmf = (float *) x_dm;
|
||||||
|
|
||||||
const block_q8_0 * bx0 = (block_q8_0 *) vx;
|
const block_q8_0 * bx0 = (const block_q8_0 *) vx;
|
||||||
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
|
||||||
|
@ -2708,6 +2724,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
|
||||||
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
|
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
|
||||||
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
||||||
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
|
||||||
|
(void)x_qh; (void)x_sc;
|
||||||
|
|
||||||
const float * x_dmf = (const float *) x_dm;
|
const float * x_dmf = (const float *) x_dm;
|
||||||
const float * y_df = (const float *) y_ds;
|
const float * y_df = (const float *) y_ds;
|
||||||
|
@ -2741,6 +2758,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
||||||
|
(void)x_qh;
|
||||||
|
|
||||||
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
__shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
|
||||||
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
|
__shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K];
|
||||||
|
@ -2754,6 +2772,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(
|
||||||
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q2_K(
|
||||||
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
|
||||||
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
|
||||||
|
(void)x_qh;
|
||||||
|
|
||||||
GGML_CUDA_ASSUME(i_offset >= 0);
|
GGML_CUDA_ASSUME(i_offset >= 0);
|
||||||
GGML_CUDA_ASSUME(i_offset < nwarps);
|
GGML_CUDA_ASSUME(i_offset < nwarps);
|
||||||
|
@@ -2763,7 +2782,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI2_K;
     const int kqsx = k % QI2_K;

-    const block_q2_K * bx0 = (block_q2_K *) vx;
+    const block_q2_K * bx0 = (const block_q2_K *) vx;

 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2811,6 +2830,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;

     const int kbx = k / QI2_K;
     const int ky = (k % QI2_K) * QR2_K;
@@ -2884,7 +2904,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI3_K;
     const int kqsx = k % QI3_K;

-    const block_q3_K * bx0 = (block_q3_K *) vx;
+    const block_q3_K * bx0 = (const block_q3_K *) vx;

 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -2965,7 +2985,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;

-    const int8_t * scales = ((int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;

     int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];

@@ -3080,6 +3100,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }

 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;

     __shared__ int   tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K];
@@ -3093,6 +3114,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;

     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3102,7 +3124,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256

-    const block_q4_K * bx0 = (block_q4_K *) vx;
+    const block_q4_K * bx0 = (const block_q4_K *) vx;

 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3147,7 +3169,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

     const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);

-    const int * scales = (int *) bxi->scales;
+    const int * scales = (const int *) bxi->scales;

     const int ksc = k % (WARP_SIZE/8);

@@ -3162,6 +3184,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;

     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);

@@ -3261,6 +3284,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }

 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;

     __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K];
@@ -3274,6 +3298,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;

     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3283,7 +3308,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256

-    const block_q5_K * bx0 = (block_q5_K *) vx;
+    const block_q5_K * bx0 = (const block_q5_K *) vx;

 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3339,7 +3364,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

     const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);

-    const int * scales = (int *) bxi->scales;
+    const int * scales = (const int *) bxi->scales;

     const int ksc = k % (WARP_SIZE/8);

@@ -3354,6 +3379,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;

     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);

@@ -3390,6 +3416,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 }

 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
+    (void)x_qh;

     __shared__ int   tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y];
     __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K];
@@ -3403,6 +3430,7 @@ template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q6_K(
 template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinline__ void load_tiles_q6_K(
     const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
     int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) {
+    (void)x_qh;

     GGML_CUDA_ASSUME(i_offset >= 0);
     GGML_CUDA_ASSUME(i_offset < nwarps);
@@ -3412,7 +3440,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
     const int kbx = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256

-    const block_q6_K * bx0 = (block_q6_K *) vx;
+    const block_q6_K * bx0 = (const block_q6_K *) vx;

 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
@@ -3474,6 +3502,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
     const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+    (void)x_qh;

     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
@@ -3516,7 +3545,7 @@ static __device__ __forceinline__ void mul_mat_q(
     __shared__ int   tile_y_qs[mmq_x * WARP_SIZE];
     __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];

-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {0.0f};
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};

     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {

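The `{0.0f}` to `{{0.0f}}` change above does not alter behaviour: both forms zero-initialize the whole 2-D accumulator, but the inner braces match the array's two dimensions and avoid the -Wmissing-braces warning. A minimal sketch, assuming arbitrary small dimensions for illustration:

    #include <cstdio>

    int main() {
        const int ROWS = 2, COLS = 4;
        float sum[ROWS][COLS] = {{0.0f}}; // one brace level per dimension; all remaining elements are zero-initialized
        float total = 0.0f;
        for (int r = 0; r < ROWS; ++r) {
            for (int c = 0; c < COLS; ++c) {
                total += sum[r][c];
            }
        }
        std::printf("total = %f\n", total); // prints 0.000000
        return 0;
    }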
@@ -4581,8 +4610,8 @@ static __global__ void rope(

 template<typename T, bool has_pos>
 static __global__ void rope_neox(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

@@ -4591,23 +4620,25 @@ static __global__ void rope_neox(
     }

     const int row = blockDim.x*blockIdx.x + threadIdx.x;
-    const int i = row*ncols + col/2;
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    const int i = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;

-    // simplified from `(ib * ncols + col) * (-1 / ncols)`, where ib is assumed to be zero
-    const float cur_rot = -float(col)/ncols;
+    float cur_rot = inv_ndims * ic - ib;

     const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p*powf(freq_base, cur_rot);
+    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f);

     float cos_theta, sin_theta;
     rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);

     const float x0 = x[i + 0];
-    const float x1 = x[i + ncols/2];
+    const float x1 = x[i + n_dims/2];

     dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
 }

 static __global__ void rope_glm_f32(
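The reworked kernel no longer assumes ncols == n_dims: the flat column index is split into a group index ib = col / n_dims and an in-group index ic = col % n_dims, and the two rotated elements sit n_dims/2 apart instead of ncols/2 apart. A host-side sketch of the same index math (hypothetical helper, values chosen only for the check):

    #include <cassert>

    // Mirror of the rope_neox indexing: for a flat, even column index `col` in a
    // row of `ncols` elements grouped into rotations of size `n_dims`, return the
    // offsets of the two elements that get rotated together.
    static void neox_pair(int row, int ncols, int n_dims, int col, int * i0, int * i1) {
        const int ib = col / n_dims;              // which n_dims-sized group
        const int ic = col % n_dims;              // position inside the group
        const int i  = row*ncols + ib*n_dims + ic/2;
        *i0 = i;
        *i1 = i + n_dims/2;
    }

    int main() {
        int a, b;
        neox_pair(/*row=*/0, /*ncols=*/8, /*n_dims=*/4, /*col=*/6, &a, &b);
        assert(a == 5 && b == 7); // ib = 1, ic = 2 -> i = 4 + 1 = 5, partner at 5 + 4/2 = 7
        return 0;
    }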
@@ -5710,20 +5741,26 @@ static void rope_cuda(

 template<typename T>
 static void rope_neox_cuda(
-    const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
+    const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows,
     float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream
 ) {
     GGML_ASSERT(ncols % 2 == 0);
     const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
     const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
     const dim3 block_nums(nrows, num_blocks_x, 1);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
     if (pos == nullptr) {
         rope_neox<T, false><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     } else {
         rope_neox<T, true><<<block_nums, block_dims, 0, stream>>>(
-            x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims
+            x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
+            theta_scale, inv_ndims
         );
     }
 }
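The host wrapper now precomputes theta_scale = freq_base^(-2/n_dims) once per launch, so the kernel's base angle p*freq_scale*theta_scale^(col/2) is algebraically the same as p*freq_scale*freq_base^(-col/n_dims). A small sketch checking that identity with illustrative values (not taken from the patch):

    #include <math.h>
    #include <assert.h>

    int main() {
        const float freq_base  = 10000.0f;
        const float freq_scale = 1.0f;
        const int   n_dims     = 128;
        const int   p          = 42;   // token position
        const int   col        = 10;   // even column index, < n_dims

        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        const float a = p*freq_scale*powf(theta_scale, col/2.0f);          // form used by the kernel
        const float b = p*freq_scale*powf(freq_base, -(float)col/n_dims);  // equivalent closed form
        assert(fabsf(a - b) < 1e-3f*fabsf(b));
        return 0;
    }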
@@ -5838,7 +5875,7 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
         return ptr;
     }
 #ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+    fprintf(stderr, "%s: %d buffers, max_size = %u MiB, tot_size = %u MiB, requested %u MiB\n", __func__, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
 #endif
     void * ptr;
@@ -5976,7 +6013,7 @@ void * ggml_cuda_host_malloc(size_t size) {
         // The allocation error can be bypassed. A null ptr will assigned out of this function.
        // This can fixed the OOM error in WSL.
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "WARNING: failed to allocate %.2f MiB of pinned memory: %s\n",
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -6021,18 +6058,18 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
         return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream);
-    } else if (nb0 == ts) {
-        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
-            if (r != cudaSuccess) return r;
-        }
-        return cudaSuccess;
     }
+    if (nb0 == ts) {
+        return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream);
+    }
+    for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+        const void * rx = (const void *) ((const char *) x + i1*nb1);
+        void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+        // pretend the row is a matrix with cols=1
+        cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream);
+        if (r != cudaSuccess) { return r; }
+    }
+    return cudaSuccess;
 }

 static void ggml_cuda_op_repeat(
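The copy helper is flattened from an if/else-if chain into early returns, but it keeps the existing trick of treating one strided row as an ne0-by-1 "matrix" so cudaMemcpy2DAsync can gather elements whose stride differs from their size. A host-side sketch of that call shape (assumes the CUDA runtime; names are illustrative, not from the patch):

    #include <cuda_runtime.h>

    // Gather `ne0` elements of `elem_size` bytes each from a strided source row
    // (consecutive elements are `stride` bytes apart) into a packed destination,
    // using width = one element and height = ne0 rows.
    static cudaError_t copy_strided_row(void * dst, const void * src,
                                        size_t elem_size, size_t stride, size_t ne0,
                                        cudaMemcpyKind kind, cudaStream_t stream) {
        return cudaMemcpy2DAsync(dst, /*dpitch=*/elem_size,
                                 src, /*spitch=*/stride,
                                 /*width=*/elem_size, /*height=*/ne0,
                                 kind, stream);
    }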
@@ -6354,6 +6391,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return max_compute_capability >= CC_RDNA2 ? 128 : 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
             return max_compute_capability >= CC_RDNA2 ? 128 : 32;
@@ -6376,6 +6414,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q8_0:
             return 64;
         case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
             return 1;
         case GGML_TYPE_Q2_K:
         case GGML_TYPE_Q3_K:
@@ -6676,15 +6715,14 @@ inline void ggml_cuda_op_rope(
         GGML_ASSERT(false);
         rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
-        GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
         if (src0->type == GGML_TYPE_F32) {
             rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
             rope_neox_cuda(
-                (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else {
@@ -6985,7 +7023,7 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
     const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
+    // const int64_t nrows0 = ggml_nrows(src0);

     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
@@ -7086,7 +7124,7 @@ static void ggml_cuda_op_mul_mat(
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
-            const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
             src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }

@@ -7319,7 +7357,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
 }

 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }

     const int64_t ne10 = src1->ne[0];

@@ -7397,7 +7435,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor
     ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }

-__global__ void k_compute_batched_ptrs(
+__global__ static void k_compute_batched_ptrs(
     const half * src0_as_f16, const half * src1_as_f16, half * dst_f16,
     const void ** ptrs_src, void ** ptrs_dst,
     int ne12, int ne13,
@@ -7727,7 +7765,7 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
 }

-void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
 }

@@ -7842,11 +7880,11 @@ static size_t g_temp_tensor_extra_index = 0;

 static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
     }

     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));

@@ -8013,7 +8051,7 @@ void ggml_cuda_free_scratch() {
 }

 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+    if (!g_cublas_loaded) { return false; }

     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
@@ -8027,7 +8065,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
 #endif
             return false;
         }
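The debug message now prints the int64_t ne[3] values with the PRId64 macro from <inttypes.h>; note that the macro only supplies the length/type suffix, so a '%' must still precede it in the format string. A minimal sketch of the pattern:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main() {
        int64_t ne3_src0 = 1, ne3_src1 = 2;
        // PRId64 expands to the correct conversion suffix for int64_t on this platform
        std::printf("src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 "\n", ne3_src0, ne3_src1);
        return 0;
    }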
@@ -8173,11 +8211,11 @@ struct ggml_backend_buffer_context_cuda {

     ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
         }

         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));

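Both copies of the temporary tensor-extra allocator now size their pool with GGML_CUDA_MAX_NODES instead of GGML_DEFAULT_GRAPH_SIZE and hand slots out round-robin. A stripped-down sketch of that ring-of-slots pattern (placeholder types and constants, not the real structs):

    #include <cstddef>
    #include <cstring>

    struct slot_t { void * data[8]; };        // stand-in for ggml_tensor_extra_gpu

    static const size_t MAX_SLOTS = 16384;    // stand-in for the GGML_CUDA_MAX_NODES-style cap
    static slot_t * g_slots      = nullptr;
    static size_t   g_slot_index = 0;

    static slot_t * alloc_temp_slot() {
        if (g_slots == nullptr) {
            g_slots = new slot_t[MAX_SLOTS];  // allocated once, reused for the whole run
        }
        slot_t * s = &g_slots[g_slot_index];
        g_slot_index = (g_slot_index + 1) % MAX_SLOTS; // wrap around; old entries are recycled
        std::memset(s, 0, sizeof(*s));
        return s;
    }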
@@ -8312,14 +8350,14 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }

-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");

     UNUSED(backend);
     UNUSED(plan);
 }

-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+[[noreturn]] static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");

     UNUSED(backend);
@@ -8335,8 +8373,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];

-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) {
             continue;
+        }
         assert(node->backend == GGML_BACKEND_GPU);
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
17
ggml-metal.m
@@ -345,10 +345,10 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
         }
     }

     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MiB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
     if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MiB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
     } else {
         GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
     }
@@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
         ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

         if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
             return false;
         }

-        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB", __func__, name, size_aligned / 1024.0 / 1024.0);

         ++ctx->n_buffers;
     } else {
@@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

             if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MiB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
                 return false;
             }

-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MiB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
             if (i + size_step < size) {
                 GGML_METAL_LOG_INFO("\n");
             }
@@ -1433,7 +1433,8 @@ void ggml_metal_graph_compute(
             const int n_past     = ((int32_t *) dst->op_params)[0];
             const int n_dims     = ((int32_t *) dst->op_params)[1];
             const int mode       = ((int32_t *) dst->op_params)[2];
-            const int n_orig_ctx = ((int32_t *) dst->op_params)[3];
+            // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+            const int n_orig_ctx = ((int32_t *) dst->op_params)[4];

             float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
             memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
@@ -19,7 +19,7 @@
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
-#ifdef __POWER9_VECTOR__
+#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
 #include <altivec.h>
 #undef bool
 #define bool _Bool
104
ggml.c
@@ -9611,10 +9611,12 @@ static void ggml_compute_forward_out_prod_f32(
     const int ith = params->ith;
     const int nth = params->nth;

+    GGML_ASSERT(ne0 == ne00);
+    GGML_ASSERT(ne1 == ne10);
+    GGML_ASSERT(ne2 == ne02);
     GGML_ASSERT(ne02 == ne12);
-    GGML_ASSERT(ne03 == ne13);
-    GGML_ASSERT(ne2 == ne12);
     GGML_ASSERT(ne3 == ne13);
+    GGML_ASSERT(ne03 == ne13);

     // we don't support permuted src0 or src1
     GGML_ASSERT(nb00 == sizeof(float));
@@ -9625,18 +9627,25 @@ static void ggml_compute_forward_out_prod_f32(
     // GGML_ASSERT(nb1 <= nb2);
     // GGML_ASSERT(nb2 <= nb3);

-    GGML_ASSERT(ne0 == ne00);
-    GGML_ASSERT(ne1 == ne10);
-    GGML_ASSERT(ne2 == ne02);
-    GGML_ASSERT(ne3 == ne03);
-
     // nb01 >= nb00 - src0 is not transposed
     // compute by src0 rows

     // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
-    // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
+    // TODO: #if defined(GGML_USE_CLBLAST)
+
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    bool use_blas = ggml_is_matrix(src0) &&
+        ggml_is_matrix(src1) &&
+        ggml_is_contiguous(src0) &&
+        (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
+#endif

     if (params->type == GGML_TASK_INIT) {
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
+        if (use_blas) {
+            return;
+        }
+#endif
         ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -9645,6 +9654,50 @@ static void ggml_compute_forward_out_prod_f32(
         return;
     }

+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+    if (use_blas) {
+        if (params->ith != 0) { // All threads other than the first do no work.
+            return;
+        }
+        // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
+        // src0: (k,n)
+        // src1: (k,m)
+        // dst:  (m,n)
+        //
+        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
+        // Also expressed as (major,minor)
+        // a: (m,k): so src1 transposed
+        // b: (k,n): so src0
+        // c: (m,n)
+        //
+        // However, if ggml_is_transposed(src1) is true, then
+        // src1->data already contains a transposed version, so sgemm mustn't
+        // transpose it further.
+
+        int n = src0->ne[0];
+        int k = src0->ne[1];
+        int m = src1->ne[0];
+
+        int transposeA, lda;
+
+        if (!ggml_is_transposed(src1)) {
+            transposeA = CblasTrans;
+            lda = m;
+        } else {
+            transposeA = CblasNoTrans;
+            lda = k;
+        }
+
+        float * a = (float *) ((char *) src1->data);
+        float * b = (float *) ((char *) src0->data);
+        float * c = (float *) ((char *) dst->data);
+
+        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
+
+        return;
+    }
+#endif
+
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
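With row-major storage the comment block above amounts to dst(m,n) = src1^T * src0, where src1 is stored as (k,m) and src0 as (k,n). A small self-contained check of that sgemm argument mapping against a naive triple loop, assuming a CBLAS implementation such as OpenBLAS is available (values are illustrative only):

    #include <cblas.h>
    #include <cmath>
    #include <cstdio>

    int main() {
        const int k = 2, n = 3, m = 2;
        const float src0[k*n] = { 1, 2, 3,   // (k,n), row-major
                                  4, 5, 6 };
        const float src1[k*m] = { 1, 0,      // (k,m), row-major
                                  0, 2 };
        float dst[m*n] = {0};

        // a = src1 (transposed by sgemm), lda = m; b = src0, ldb = n; c = dst, ldc = n
        cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, m, n, k,
                    1.0f, src1, m, src0, n, 0.0f, dst, n);

        // reference: dst[i][j] = sum over kk of src1[kk][i] * src0[kk][j]
        for (int i = 0; i < m; ++i) {
            for (int j = 0; j < n; ++j) {
                float ref = 0.0f;
                for (int kk = 0; kk < k; ++kk) {
                    ref += src1[kk*m + i] * src0[kk*n + j];
                }
                if (fabsf(ref - dst[i*n + j]) > 1e-6f) {
                    std::printf("mismatch at (%d,%d)\n", i, j);
                    return 1;
                }
            }
        }
        std::printf("sgemm mapping matches the naive loop\n");
        return 0;
    }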
@@ -18073,7 +18126,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));

-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct gguf_kv * kv = &ctx->kv[i];

             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -18120,7 +18173,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                 case GGUF_TYPE_STRING:
                     {
                         kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
-                        for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                        for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                             ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                         }
                     } break;
@@ -18148,7 +18201,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     {
         ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));

-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];

             for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@@ -18195,7 +18248,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // compute the total size of the data section, taking into account the alignment
     {
         ctx->size = 0;
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct gguf_tensor_info * info = &ctx->infos[i];

             const int64_t ne =
@@ -18264,7 +18317,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         ggml_set_no_alloc(ctx_data, true);

         // create the tensors
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             const int64_t ne[GGML_MAX_DIMS] = {
                 ctx->infos[i].ne[0],
                 ctx->infos[i].ne[1],
@@ -18399,24 +18452,29 @@ int gguf_find_key(const struct gguf_context * ctx, const char * key) {
 }

 const char * gguf_get_key(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     return ctx->kv[key_id].key.data;
 }

 enum gguf_type gguf_get_kv_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     return ctx->kv[key_id].type;
 }

 enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.type;
 }

 const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.data;
 }

 const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     struct gguf_kv * kv = &ctx->kv[key_id];
     struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
@@ -18424,70 +18482,90 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
 }

 int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.n;
 }

 uint8_t gguf_get_val_u8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT8);
     return ctx->kv[key_id].value.uint8;
 }

 int8_t gguf_get_val_i8(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT8);
     return ctx->kv[key_id].value.int8;
 }

 uint16_t gguf_get_val_u16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT16);
     return ctx->kv[key_id].value.uint16;
 }

 int16_t gguf_get_val_i16(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT16);
     return ctx->kv[key_id].value.int16;
 }

 uint32_t gguf_get_val_u32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT32);
     return ctx->kv[key_id].value.uint32;
 }

 int32_t gguf_get_val_i32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT32);
     return ctx->kv[key_id].value.int32;
 }

 float gguf_get_val_f32(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT32);
     return ctx->kv[key_id].value.float32;
 }

 uint64_t gguf_get_val_u64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_UINT64);
     return ctx->kv[key_id].value.uint64;
 }

 int64_t gguf_get_val_i64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_INT64);
     return ctx->kv[key_id].value.int64;
 }

 double gguf_get_val_f64(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_FLOAT64);
     return ctx->kv[key_id].value.float64;
 }

 bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_BOOL);
     return ctx->kv[key_id].value.bool_;
 }

 const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
     return ctx->kv[key_id].value.str.data;
 }

+const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
+    GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_STRING);
+    return &ctx->kv[key_id].value;
+}
+
 int gguf_get_n_tensors(const struct gguf_context * ctx) {
     return ctx->header.n_tensors;
 }
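Every getter now asserts that key_id is in range and that the stored type matches, and the new gguf_get_val_data returns a type-erased pointer to the scalar value for callers that dispatch on gguf_get_kv_type themselves. A hedged usage sketch against this public API (the file name is hypothetical):

    #include "ggml.h"
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // hypothetical path
        if (ctx == NULL) {
            fprintf(stderr, "failed to open file\n");
            return 1;
        }

        const int key_id = gguf_find_key(ctx, "general.file_type");
        if (key_id >= 0 && gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_UINT32) {
            // typed accessor - now asserts on an out-of-range id or type mismatch
            printf("file_type = %u\n", gguf_get_val_u32(ctx, key_id));
            // type-erased accessor added in this change - caller interprets the bytes
            const uint32_t * raw = (const uint32_t *) gguf_get_val_data(ctx, key_id);
            printf("raw       = %u\n", *raw);
        }

        gguf_free(ctx);
        return 0;
    }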
1
ggml.h
@@ -2045,6 +2045,7 @@ extern "C" {
     GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
     GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
     GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
     GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
@@ -56,20 +56,21 @@ class Keys:
         SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"

     class Tokenizer:
         MODEL         = "tokenizer.ggml.model"
         LIST          = "tokenizer.ggml.tokens"
         TOKEN_TYPE    = "tokenizer.ggml.token_type"
         SCORES        = "tokenizer.ggml.scores"
         MERGES        = "tokenizer.ggml.merges"
         BOS_ID        = "tokenizer.ggml.bos_token_id"
         EOS_ID        = "tokenizer.ggml.eos_token_id"
         UNK_ID        = "tokenizer.ggml.unknown_token_id"
         SEP_ID        = "tokenizer.ggml.seperator_token_id"
         PAD_ID        = "tokenizer.ggml.padding_token_id"
         ADD_BOS       = "tokenizer.ggml.add_bos_token"
         ADD_EOS       = "tokenizer.ggml.add_eos_token"
         HF_JSON       = "tokenizer.huggingface.json"
         RWKV          = "tokenizer.rwkv.world"
+        CHAT_TEMPLATE = "tokenizer.chat_template"


 #
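The chat template written by gguf-py under the new "tokenizer.chat_template" key can be read back on the C side with the existing string getter. A hedged sketch (file name is hypothetical, not from the patch):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
        struct gguf_context * ctx = gguf_init_from_file("model.gguf", params); // hypothetical path
        if (ctx == NULL) {
            return 1;
        }

        const int key_id = gguf_find_key(ctx, "tokenizer.chat_template");
        if (key_id >= 0 && gguf_get_kv_type(ctx, key_id) == GGUF_TYPE_STRING) {
            printf("chat template:\n%s\n", gguf_get_val_str(ctx, key_id));
        } else {
            printf("no chat template stored in this model\n");
        }

        gguf_free(ctx);
        return 0;
    }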
@ -221,7 +221,7 @@ class GGUFWriter:
|
||||||
if self.endianess == GGUFEndian.BIG:
|
if self.endianess == GGUFEndian.BIG:
|
||||||
tensor.byteswap(inplace=True)
|
tensor.byteswap(inplace=True)
|
||||||
if self.use_temp_file and self.temp_file is None:
|
if self.use_temp_file and self.temp_file is None:
|
||||||
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
|
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256 * 1024 * 1024)
|
||||||
fp.seek(0)
|
fp.seek(0)
|
||||||
self.temp_file = fp
|
self.temp_file = fp
|
||||||
|
|
||||||
|
@ -399,6 +399,9 @@ class GGUFWriter:
|
||||||
def add_add_eos_token(self, value: bool) -> None:
|
def add_add_eos_token(self, value: bool) -> None:
|
||||||
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
self.add_bool(Keys.Tokenizer.ADD_EOS, value)
|
||||||
|
|
||||||
|
def add_chat_template(self, value: str) -> None:
|
||||||
|
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
|
||||||
|
|
||||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||||
pack_prefix = ''
|
pack_prefix = ''
|
||||||
if not skip_pack_prefix:
|
if not skip_pack_prefix:
|
||||||
|
|
|
@@ -13,6 +13,7 @@ class SpecialVocab:
     merges: list[str]
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
+    chat_template: str | None

     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -24,6 +25,7 @@ class SpecialVocab:
         self.n_vocab = n_vocab
         self.load_merges = load_merges
         self.merges = []
+        self.chat_template = None
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -67,6 +69,10 @@ class SpecialVocab:
                 if not quiet:
                     print(f'gguf: Setting add_{typ}_token to {value}')
                 add_handler(value)
+        if self.chat_template is not None:
+            if not quiet:
+                print(f'gguf: Setting chat_template to {self.chat_template}')
+            gw.add_chat_template(self.chat_template)

     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -117,24 +123,37 @@ class SpecialVocab:

     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer_file = path / 'tokenizer.json'
-        if not tokenizer_file.is_file():
-            return False
-        with open(tokenizer_file, encoding = 'utf-8') as f:
-            tokenizer = json.load(f)
-        if self.load_merges:
-            merges = tokenizer.get('model', {}).get('merges')
-            if isinstance(merges, list) and merges and isinstance(merges[0], str):
-                self.merges = merges
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, encoding = 'utf-8') as f:
+                tokenizer = json.load(f)
+            if self.load_merges:
+                merges = tokenizer.get('model', {}).get('merges')
+                if isinstance(merges, list) and merges and isinstance(merges[0], str):
+                    self.merges = merges
+            added_tokens = tokenizer.get('added_tokens', {})
+        else:
+            added_tokens = {}
         tokenizer_config_file = path / 'tokenizer_config.json'
-        added_tokens = tokenizer.get('added_tokens')
-        if added_tokens is None or not tokenizer_config_file.is_file():
+        if not tokenizer_config_file.is_file():
             return True
         with open(tokenizer_config_file, encoding = 'utf-8') as f:
             tokenizer_config = json.load(f)
+        chat_template = tokenizer_config.get('chat_template')
+        if chat_template is None or isinstance(chat_template, str):
+            self.chat_template = chat_template
+        else:
+            print(
+                f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
+                file = sys.stderr
+            )
         for typ in self.special_token_types:
             add_entry = tokenizer_config.get(f'add_{typ}_token')
             if isinstance(add_entry, bool):
                 self.add_special_token[typ] = add_entry
+            if not added_tokens:
+                # We will need this to get the content for the token, so if it's empty
+                # may as well just give up.
+                continue
             entry = tokenizer_config.get(f'{typ}_token')
             if isinstance(entry, str):
                 tc_content = entry
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.5.2"
+version = "0.6.0"
 description = "Read and write ML models in GGUF for GGML"
 authors = ["GGML <ggml@ggml.ai>"]
 packages = [
@@ -86,13 +86,14 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
             curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
         else:
             curr["value"] = field.parts[-1].tolist()[0]
-    for idx, tensor in enumerate(reader.tensors):
-        tensors[tensor.name] = {
-            "index": idx,
-            "shape": tensor.shape.tolist(),
-            "type": tensor.tensor_type.name,
-            "offset": tensor.field.offset,
-        }
+    if not args.no_tensors:
+        for idx, tensor in enumerate(reader.tensors):
+            tensors[tensor.name] = {
+                "index": idx,
+                "shape": tensor.shape.tolist(),
+                "type": tensor.tensor_type.name,
+                "offset": tensor.field.offset,
+            }
     json.dump(result, sys.stdout)


llama.cpp (446 lines changed)
@@ -91,7 +91,7 @@
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif

-#define LLAMA_MAX_NODES 4096
+#define LLAMA_MAX_NODES 8192

 //
 // logging
@@ -255,6 +255,8 @@ enum llm_kv {
     LLM_KV_TOKENIZER_UNK_ID,
     LLM_KV_TOKENIZER_SEP_ID,
     LLM_KV_TOKENIZER_PAD_ID,
+    LLM_KV_TOKENIZER_ADD_BOS,
+    LLM_KV_TOKENIZER_ADD_EOS,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
 };
@@ -303,6 +305,8 @@ static std::map<llm_kv, std::string> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" },
     { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" },
     { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" },
+    { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
+    { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
 };
@@ -600,6 +604,60 @@ static int8_t llama_rope_scaling_type_from_string(const std::string & name) {
     return LLAMA_ROPE_SCALING_UNSPECIFIED;
 }

+static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
+    switch (type) {
+        case GGUF_TYPE_UINT8:   return std::to_string(((const uint8_t *)data)[i]);
+        case GGUF_TYPE_INT8:    return std::to_string(((const int8_t *)data)[i]);
+        case GGUF_TYPE_UINT16:  return std::to_string(((const uint16_t *)data)[i]);
+        case GGUF_TYPE_INT16:   return std::to_string(((const int16_t *)data)[i]);
+        case GGUF_TYPE_UINT32:  return std::to_string(((const uint32_t *)data)[i]);
+        case GGUF_TYPE_INT32:   return std::to_string(((const int32_t *)data)[i]);
+        case GGUF_TYPE_UINT64:  return std::to_string(((const uint64_t *)data)[i]);
+        case GGUF_TYPE_INT64:   return std::to_string(((const int64_t *)data)[i]);
+        case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+        case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+        case GGUF_TYPE_BOOL:    return ((const bool *)data)[i] ? "true" : "false";
+        default:                return format("unknown type %d", type);
+    }
+}
+
+static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+    const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+
+    switch (type) {
+        case GGUF_TYPE_STRING:
+            return gguf_get_val_str(ctx_gguf, i);
+        case GGUF_TYPE_ARRAY:
+            {
+                const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
+                int arr_n = gguf_get_arr_n(ctx_gguf, i);
+                const void * data = gguf_get_arr_data(ctx_gguf, i);
+                std::stringstream ss;
+                ss << "[";
+                for (int j = 0; j < arr_n; j++) {
+                    if (arr_type == GGUF_TYPE_STRING) {
+                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        // escape quotes
+                        replace_all(val, "\\", "\\\\");
+                        replace_all(val, "\"", "\\\"");
+                        ss << '"' << val << '"';
+                    } else if (arr_type == GGUF_TYPE_ARRAY) {
+                        ss << "???";
+                    } else {
+                        ss << gguf_data_to_str(arr_type, data, j);
+                    }
+                    if (j < arr_n - 1) {
+                        ss << ", ";
+                    }
+                }
+                ss << "]";
+                return ss.str();
+            }
+        default:
+            return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
+    }
+}
+
 //
 // ggml helpers
 //
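As an aside on the escaping above: it is plain substring replacement applied before the value is wrapped in quotes. A minimal standalone sketch (the `replace_all` below is a stand-in with the same semantics as llama.cpp's helper, not the real implementation) shows the kind of output gguf_kv_to_str produces for a string array:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// stand-in for llama.cpp's replace_all helper (assumed to have the same simple semantics)
static void replace_all(std::string & s, const std::string & from, const std::string & to) {
    for (size_t pos = 0; (pos = s.find(from, pos)) != std::string::npos; pos += to.size()) {
        s.replace(pos, from.size(), to);
    }
}

int main() {
    std::vector<std::string> arr = { "plain", "with \"quotes\"", "back\\slash" };
    std::stringstream ss;
    ss << "[";
    for (size_t j = 0; j < arr.size(); j++) {
        std::string val = arr[j];
        replace_all(val, "\\", "\\\\"); // escape backslashes first
        replace_all(val, "\"", "\\\""); // then quotes
        ss << '"' << val << '"';
        if (j + 1 < arr.size()) ss << ", ";
    }
    ss << "]";
    std::cout << ss.str() << "\n"; // prints: ["plain", "with \"quotes\"", "back\\slash"]
    return 0;
}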
@@ -1060,6 +1118,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 //

 struct llama_state {
+    llama_state() {
+#ifdef GGML_USE_METAL
+        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+#endif
+    }
+
     // We save the log callback globally
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
@@ -1083,9 +1147,9 @@ enum e_model {
     MODEL_70B,
 };

-static const size_t kB = 1024;
-static const size_t MB = 1024*kB;
-static const size_t GB = 1024*MB;
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;

 struct llama_hparams {
     bool vocab_only;
@@ -1222,6 +1286,7 @@ struct llama_kv_cache {
     // cannot be freely changed after a slot has been allocated.
     uint32_t head = 0;
     uint32_t size = 0;
+    uint32_t used = 0; // used cells (i.e. at least one seq_id)

     // computed before each graph build
     uint32_t n = 0;
@@ -1276,6 +1341,9 @@ struct llama_vocab {
     id special_sep_id = -1;
     id special_pad_id = -1;

+    int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add.
+    int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add.
+
     id linefeed_id = 13;
     id special_prefix_id = 32007;
     id special_middle_id = 32009;
@@ -1320,6 +1388,9 @@ struct llama_model {

     int n_gpu_layers;

+    // gguf metadata
+    std::unordered_map<std::string, std::string> gguf_kv;
+
     // context
     struct ggml_context * ctx = NULL;

@@ -1440,6 +1511,7 @@ static bool llama_kv_cache_init(

     cache.head = 0;
     cache.size = n_ctx;
+    cache.used = 0;

     cache.cells.clear();
     cache.cells.resize(n_ctx);
@@ -1481,7 +1553,7 @@ static bool llama_kv_cache_init(
             vram_kv_cache += ggml_nbytes(cache.k);
         }
         if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
         }
     }
 #endif
@@ -1541,6 +1613,8 @@ static bool llama_kv_cache_find_slot(
         }
     }

+    cache.used += n_tokens;
+
     return true;
 }

@@ -1561,6 +1635,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) {
         cache.cells[i].seq_id.clear();
     }
     cache.head = 0;
+    cache.used = 0;
 }

 static void llama_kv_cache_seq_rm(
@@ -1583,6 +1658,9 @@ static void llama_kv_cache_seq_rm(
                 continue;
             }
             if (cache.cells[i].seq_id.empty()) {
+                // keep count of the number of used cells
+                if (cache.cells[i].pos >= 0) cache.used--;
+
                 cache.cells[i].pos = -1;
                 if (new_head == cache.size) new_head = i;
             }
@@ -1590,7 +1668,7 @@
     }

     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }

 static void llama_kv_cache_seq_cp(
@@ -1616,6 +1694,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id

     for (uint32_t i = 0; i < cache.size; ++i) {
         if (!cache.cells[i].has_seq_id(seq_id)) {
+            if (cache.cells[i].pos >= 0) cache.used--;
             cache.cells[i].pos = -1;
             cache.cells[i].seq_id.clear();
             if (new_head == cache.size) new_head = i;
@@ -1626,7 +1705,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
     }

     // If we freed up a slot, set head to it so searching can start there.
-    if (new_head != cache.size) cache.head = new_head;
+    if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }

 static void llama_kv_cache_seq_shift(
@@ -1647,6 +1726,7 @@ static void llama_kv_cache_seq_shift(
             cache.cells[i].delta += delta;

             if (cache.cells[i].pos < 0) {
+                if (!cache.cells[i].seq_id.empty()) cache.used--;
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
                 if (new_head == cache.size) new_head = i;
@@ -1778,10 +1858,10 @@ struct llama_model_loader {
                 case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
                 case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
                 default:
                     {
                         LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
                         ftype = LLAMA_FTYPE_ALL_F32;
                     } break;
             }

             // this is a way to mark that we have "guessed" the file type
@@ -1795,10 +1875,21 @@ struct llama_model_loader {
         }

         for (int i = 0; i < n_kv; i++) {
             const char * name = gguf_get_key(ctx_gguf, i);
             const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
+            const std::string type_name =
+                type == GGUF_TYPE_ARRAY
+                ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i))
+                : gguf_type_name(type);
+
+            std::string value = gguf_kv_to_str(ctx_gguf, i);
+            const size_t MAX_VALUE_LEN = 40;
+            if (value.size() > MAX_VALUE_LEN) {
+                value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
+            }
+            replace_all(value, "\n", "\\n");

-            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-8s\n", __func__, i, name, gguf_type_name(type));
+            LLAMA_LOG_INFO("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
         }

         // print type counts
@@ -2093,6 +2184,17 @@ static void llm_load_hparams(

     auto & hparams = model.hparams;

+    // get metadata as string
+    for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
+        enum gguf_type type = gguf_get_kv_type(ctx, i);
+        if (type == GGUF_TYPE_ARRAY) {
+            continue;
+        }
+        const char * name = gguf_get_key(ctx, i);
+        const std::string value = gguf_kv_to_str(ctx, i);
+        model.gguf_kv.emplace(name, value);
+    }
+
     // get general kv
     GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));

@@ -2388,6 +2490,23 @@ static void llm_load_vocab(
                         __func__, key.c_str(), id, old_id);
                 id = old_id;
             }
+
+        }
+
+        // Handle add_bos_token and add_eos_token
+        std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
+        int kid = gguf_find_key(ctx, key.c_str());
+        enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+        }
+        key = kv(LLM_KV_TOKENIZER_ADD_EOS);
+        kid = gguf_find_key(ctx, key.c_str());
+        ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
+        vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
+        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
+            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
         }
     }

@@ -2519,8 +2638,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
     LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
-    if (ml.n_bytes < GB) {
+    if (ml.n_bytes < GiB) {
         LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
         LLAMA_LOG_INFO("%s: model size = %.2f GiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     }
@@ -2558,7 +2677,7 @@ static void llm_load_tensors(

     ml.calc_sizes(ctx_size, mmapped_size);

-    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
+    LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);

     // create the ggml context
     {
@@ -3207,7 +3326,7 @@ static void llm_load_tensors(
             ctx_size +
             mmapped_size - vram_weights; // weights in VRAM not in memory

-        LLAMA_LOG_INFO("%s: mem required = %7.2f MB\n", __func__, mem_required / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: mem required = %7.2f MiB\n", __func__, mem_required / 1024.0 / 1024.0);

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
@@ -3226,7 +3345,7 @@
 #endif // GGML_USE_CUBLAS

         LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-        LLAMA_LOG_INFO("%s: VRAM used: %.2f MB\n", __func__, vram_weights / 1024.0 / 1024.0);
+        LLAMA_LOG_INFO("%s: VRAM used: %.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
 #else
         (void) n_gpu_layers;
 #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
@@ -3356,7 +3475,7 @@ static void llm_build_k_shift(
     struct ggml_cgraph * graph,
     llm_rope_type type,
     int64_t n_ctx,
-    int64_t n_rot,
+    int n_rot,
     float freq_base,
     float freq_scale,
     const llm_build_cb & cb) {
@@ -3388,7 +3507,7 @@ static void llm_build_k_shift(
         // we rotate only the first n_rot dimensions
         ggml_rope_custom_inplace(ctx,
                 ggml_view_3d(ctx, kv.k,
-                    n_rot, n_head_kv, n_ctx,
+                    n_embd_head, n_head_kv, n_ctx,
                     ggml_element_size(kv.k)*n_embd_head,
                     ggml_element_size(kv.k)*n_embd_gqa,
                     ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
@@ -4711,92 +4830,34 @@ struct llm_build_context {
             // self-attention
             {
                 // compute Q and K and RoPE them
-                struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(tmpq, "tmpq", il);
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);

-                struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(tmpk, "tmpk", il);
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);

                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);

-                // RoPE the first n_rot of q/k, pass the other half, and concat.
-                struct ggml_tensor * qrot = ggml_cont(ctx0, ggml_view_3d(
-                    ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
-                    ggml_element_size(tmpq) * n_embd_head,
-                    ggml_element_size(tmpq) * n_embd_head * n_head,
-                    0
-                ));
-                cb(qrot, "qrot", il);
-
-                struct ggml_tensor * krot = ggml_cont(ctx0, ggml_view_3d(
-                    ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
-                    ggml_element_size(tmpk) * n_embd_head,
-                    ggml_element_size(tmpk) * n_embd_head * n_head_kv,
-                    0
-                ));
-                cb(krot, "krot", il);
-
-                // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
-                struct ggml_tensor * qpass = ggml_view_3d(
-                    ctx0, tmpq, (n_embd_head - hparams.n_rot), n_head, n_tokens,
-                    ggml_element_size(tmpq) * n_embd_head,
-                    ggml_element_size(tmpq) * n_embd_head * n_head,
-                    ggml_element_size(tmpq) * hparams.n_rot
-                );
-                cb(qpass, "qpass", il);
-
-                struct ggml_tensor * kpass = ggml_view_3d(
-                    ctx0, tmpk, (n_embd_head - hparams.n_rot), n_head_kv, n_tokens,
-                    ggml_element_size(tmpk) * (n_embd_head),
-                    ggml_element_size(tmpk) * (n_embd_head) * n_head_kv,
-                    ggml_element_size(tmpk) * hparams.n_rot
-                );
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * qrotated = ggml_rope_custom(
-                    ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(qrotated, "qrotated", il);
-
-                struct ggml_tensor * krotated = ggml_rope_custom(
-                    ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
-                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
-                );
-                cb(krotated, "krotated", il);
-
-                // ggml currently only supports concatenation on dim=2
-                // so we need to permute qrot, qpass, concat, then permute back.
-                qrotated = ggml_cont(ctx0, ggml_permute(ctx0, qrotated, 2, 1, 0, 3));
-                cb(qrotated, "qrotated", il);
-
-                krotated = ggml_cont(ctx0, ggml_permute(ctx0, krotated, 2, 1, 0, 3));
-                cb(krotated, "krotated", il);
-
-                qpass = ggml_cont(ctx0, ggml_permute(ctx0, qpass, 2, 1, 0, 3));
-                cb(qpass, "qpass", il);
-
-                kpass = ggml_cont(ctx0, ggml_permute(ctx0, kpass, 2, 1, 0, 3));
-                cb(kpass, "kpass", il);
-
-                struct ggml_tensor * Qcur = ggml_concat(ctx0, qrotated, qpass);
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(Qcur, "Qcur", il);

-                struct ggml_tensor * Kcur = ggml_concat(ctx0, krotated, kpass);
-                cb(Kcur, "Kcur", il);
-
-                struct ggml_tensor * Q = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 2, 1, 0, 3));
-                cb(Q, "Q", il);
-
-                Kcur = ggml_cont(ctx0, ggml_permute(ctx0, Kcur, 2, 1, 0, 3));
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
                 cb(Kcur, "Kcur", il);

                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

                 cur = llm_build_kqv(ctx0, hparams, kv_self,
                         model.layers[il].wo, NULL,
-                        Q, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }

@@ -5424,6 +5485,12 @@ static int llama_decode_internal(
         batch.seq_id = seq_id_arr.data();
     }

+    // if we have enough unused cells before the current head ->
+    //   better to start searching from the beginning of the cache, hoping to fill it
+    if (kv_self.head > kv_self.used + 2*n_tokens) {
+        kv_self.head = 0;
+    }
+
     if (!llama_kv_cache_find_slot(kv_self, batch)) {
         return 1;
     }
@@ -5434,7 +5501,7 @@
     //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)); // TODO: this might be better for CUDA?
     kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));

-    //printf("kv_self.n = %d\n", kv_self.n);
+    //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

     ggml_allocr_reset(lctx.alloc);

@@ -6283,7 +6350,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                 // by modifying llm_tokenizer_x to operate with string offsets like pre-tokenizer
                 //  and passing 'add space prefix' as bool argument
                 //
-                auto raw_text = (special ? "" : " ") + fragment.raw_text.substr(fragment.offset, fragment.length);
+                auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
+                if (&fragment == &fragment_buffer.front()) {
+                    raw_text = " " + raw_text; // prefix with space if the first token is not special
+                }

 #ifdef PRETOKENIZERDEBUG
                 fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
@@ -6350,10 +6420,13 @@ struct llama_grammar_candidate {
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
         const char * src,
+        size_t n_src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
     const char * pos = src;
     std::vector<uint32_t> code_points;
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(n_src + 1);
     uint32_t value = partial_start.value;
     int n_remain = partial_start.n_remain;

@@ -6404,6 +6477,13 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }

+static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
+        std::string src,
+        llama_partial_utf8 partial_start
+) {
+    return decode_utf8(src.c_str(), src.size(), partial_start);
+}
+
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
@@ -7053,7 +7133,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -7260,7 +7340,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
     const std::string piece = llama_token_to_piece(ctx, token);

     // Note terminating 0 in decoded string
-    const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
+    const auto decoded = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
@@ -7935,7 +8015,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 workers.clear();
             }

-            LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
             int64_t tot_count = 0;
             for (size_t i = 0; i < hist_cur.size(); i++) {
                 hist_all[i] += hist_cur[i];
@@ -8475,7 +8555,7 @@ struct llama_context * llama_new_context_with_model(

         {
             const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: kv self size = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
         }

         // resized during inference
@@ -8505,8 +8585,6 @@

 #ifdef GGML_USE_METAL
         if (model->n_gpu_layers > 0) {
-            ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
             ctx->ctx_metal = ggml_metal_init(1);
             if (!ctx->ctx_metal) {
                 LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -8520,7 +8598,7 @@
             // measure memory requirements for the graph
             size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;

-            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);

             // recreate allocator with exact memory requirements
             ggml_allocr_free(ctx->alloc);
@@ -8534,7 +8612,7 @@
 #endif
 #ifdef GGML_USE_CUBLAS
             ggml_cuda_set_scratch_size(alloc_size);
-            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MB\n", __func__, alloc_size / 1024.0 / 1024.0);
+            LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);

             // calculate total VRAM usage
             auto add_tensor = [](const ggml_tensor * t, size_t & size) {
@@ -8554,10 +8632,10 @@
             size_t ctx_vram_size = alloc_size + kv_vram_size;
             size_t total_vram_size = model_vram_size + ctx_vram_size;

-            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MB (model: %.2f MB, context: %.2f MB)\n", __func__,
+            LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
                     total_vram_size / 1024.0 / 1024.0,
                     model_vram_size / 1024.0 / 1024.0,
                     ctx_vram_size / 1024.0 / 1024.0);
 #endif
         }

@@ -8578,7 +8656,7 @@

         const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);

-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MiB\n", __func__, max_size/1024.0/1024.0);

 #define LLAMA_METAL_CHECK_BUF(result) \
         if (!(result)) { \
@@ -8644,6 +8722,45 @@ float llama_rope_freq_scale_train(const struct llama_model * model) {
     return model->hparams.rope_freq_scale_train;
 }

+int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size) {
+    const auto & it = model->gguf_kv.find(key);
+    if (it == model->gguf_kv.end()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
+int llama_model_meta_count(const struct llama_model * model) {
+    return (int)model->gguf_kv.size();
+}
+
+int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->first.c_str());
+}
+
+int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size) {
+    if (i < 0 || i >= (int)model->gguf_kv.size()) {
+        if (buf_size > 0) {
+            buf[0] = '\0';
+        }
+        return -1;
+    }
+    auto it = model->gguf_kv.begin();
+    std::advance(it, i);
+    return snprintf(buf, buf_size, "%s", it->second.c_str());
+}
+
 int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             llama_model_arch_name(model->arch).c_str(),
@@ -8702,8 +8819,107 @@ int llama_model_apply_lora_from_file(const struct llama_model * model, const cha
     }
 }

+struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq) {
+    struct llama_kv_cache_view result = {
+        /*.n_cells            = */ 0,
+        /*.n_max_seq          = */ n_max_seq,
+        /*.token_count        = */ 0,
+        /*.used_cells         = */ llama_get_kv_cache_used_cells(ctx),
+        /*.max_contiguous     = */ 0,
+        /*.max_contiguous_idx = */ -1,
+        /*.cells              = */ nullptr,
+        /*.cells_sequences    = */ nullptr,
+    };
+    return result;
+}
+
+void llama_kv_cache_view_free(struct llama_kv_cache_view * view) {
+    if (view->cells != nullptr) {
+        free(view->cells);
+        view->cells = nullptr;
+    }
+    if (view->cells_sequences != nullptr) {
+        free(view->cells_sequences);
+        view->cells_sequences = nullptr;
+    }
+}
+
+void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view) {
+    if (uint32_t(view->n_cells) < ctx->kv_self.size || view->cells == nullptr) {
+        view->n_cells = int32_t(ctx->kv_self.size);
+        void * p = realloc(view->cells, sizeof(struct llama_kv_cache_view_cell) * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells");
+        view->cells = (struct llama_kv_cache_view_cell *)p;
+        p = realloc(view->cells_sequences, sizeof(llama_seq_id) * view->n_max_seq * view->n_cells);
+        GGML_ASSERT(p != nullptr && "Failed to alloc kv_cache_view cells sequences");
+        view->cells_sequences = (llama_seq_id *)p;
+    }
+
+    const std::vector<llama_kv_cell> & kv_cells = ctx->kv_self.cells;
+    llama_kv_cache_view_cell * c_curr = view->cells;
+    llama_seq_id * cs_curr = view->cells_sequences;
+    int32_t used_cells = 0;
+    int32_t token_count = 0;
+    int32_t curr_contig_idx = -1;
+    uint32_t max_contig = 0;
+    int32_t max_contig_idx = -1;
+
+    for (int32_t i = 0; i < int32_t(ctx->kv_self.size); i++, c_curr++, cs_curr += view->n_max_seq) {
+        const size_t curr_size = kv_cells[i].seq_id.size();
+        token_count += curr_size;
+        c_curr->pos = kv_cells[i].pos + kv_cells[i].delta;
+
+        if (curr_size > 0) {
+            if (curr_contig_idx >= 0 && uint32_t(i - curr_contig_idx) > max_contig) {
+                max_contig = i - curr_contig_idx;
+                max_contig_idx = curr_contig_idx;
+            }
+            curr_contig_idx = -1;
+        } else if (curr_contig_idx < 0) {
+            curr_contig_idx = i;
+        }
+
+        int seq_idx = 0;
+        for (const llama_seq_id it : kv_cells[i].seq_id) {
+            if (seq_idx >= view->n_max_seq) {
+                break;
+            }
+            cs_curr[seq_idx] = it;
+            seq_idx++;
+        }
+        if (seq_idx != 0) {
+            used_cells++;
+        }
+        for (; seq_idx < view->n_max_seq; seq_idx++) {
+            cs_curr[seq_idx] = -1;
+        }
+    }
+    if (curr_contig_idx >= 0 && kv_cells.size() - curr_contig_idx > max_contig) {
+        max_contig_idx = curr_contig_idx;
+        max_contig = kv_cells.size() - curr_contig_idx;
+    }
+    view->max_contiguous = max_contig;
+    view->max_contiguous_idx = max_contig_idx;
+    view->token_count = token_count;
+    view->used_cells = used_cells;
+    if (uint32_t(used_cells) != ctx->kv_self.used) {
+        LLAMA_LOG_ERROR("%s: used cells mismatch. kv_cache says %d but we calculated %d\n",
+            __func__, ctx->kv_self.used, used_cells);
+    }
+}
+
 int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
-    return ctx->kv_self.head;
+    int result = 0;
+
+    for (uint32_t i = 0; i < ctx->kv_self.size; i++) {
+        result += ctx->kv_self.cells[i].seq_id.size();
+    }
+
+    return result;
+}
+
+int llama_get_kv_cache_used_cells(const struct llama_context * ctx) {
+    return ctx->kv_self.used;
 }

 void llama_kv_cache_clear(struct llama_context * ctx) {
@@ -8873,10 +9089,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         const size_t kv_buf_size = kv_self.buf.size;
         const uint32_t kv_head = kv_self.head;
         const uint32_t kv_size = kv_self.size;
+        const uint32_t kv_used = kv_self.used;

         data_ctx->write(&kv_buf_size, sizeof(kv_buf_size));
         data_ctx->write(&kv_head, sizeof(kv_head));
         data_ctx->write(&kv_size, sizeof(kv_size));
+        data_ctx->write(&kv_used, sizeof(kv_used));

         if (kv_buf_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
@@ -8999,10 +9217,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         size_t kv_buf_size;
         uint32_t kv_head;
         uint32_t kv_size;
+        uint32_t kv_used;

         memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size);
         memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head);
         memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+        memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

         if (kv_buf_size) {
             GGML_ASSERT(kv_self.buf.size == kv_buf_size);
@@ -9037,6 +9257,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

     ctx->kv_self.head = kv_head;
     ctx->kv_self.size = kv_size;
+    ctx->kv_self.used = kv_used;

     ctx->kv_self.cells.resize(kv_size);

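The serialized state now carries the `used` counter alongside `head` and `size`. For context, a minimal save/restore round trip (a sketch assuming the existing llama_get_state_size / llama_copy_state_data / llama_set_state_data entry points, which are not part of this hunk) looks like:

#include <cstdint>
#include <vector>
#include "llama.h"

// Serialize the full context state (RNG, logits, embeddings, KV cache) into a buffer.
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));   // upper bound on the state size
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written);                                    // keep only the bytes actually written
    return buf;
}

// Restore a previously saved state; with this change head/size/used all come back consistent.
static void restore_state(llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}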
|
@ -9285,6 +9506,14 @@ llama_token llama_token_nl(const struct llama_model * model) {
|
||||||
return model->vocab.linefeed_id;
|
return model->vocab.linefeed_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int llama_add_bos_token(const struct llama_model * model) {
|
||||||
|
return model->vocab.special_add_bos;
|
||||||
|
}
|
||||||
|
|
||||||
|
int llama_add_eos_token(const struct llama_model * model) {
|
||||||
|
return model->vocab.special_add_eos;
|
||||||
|
}
|
||||||
|
|
||||||
llama_token llama_token_prefix(const struct llama_model * model) {
|
llama_token llama_token_prefix(const struct llama_model * model) {
|
||||||
return model->vocab.special_prefix_id;
|
return model->vocab.special_prefix_id;
|
||||||
}
|
}
|
||||||
|
@@ -9491,6 +9720,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
+#ifdef GGML_USE_METAL
+    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#endif
 }

 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
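With the change above, a callback installed through llama_log_set is also forwarded to the Metal backend when GGML_USE_METAL is enabled. A hedged usage sketch (the numa-flag form of llama_backend_init is assumed for this revision of the API):

#include <cstdio>
#include "llama.h"

// Route all llama.cpp (and, on Metal builds, ggml-metal) log output to a FILE* of our choice.
static void my_log_callback(ggml_log_level level, const char * text, void * user_data) {
    FILE * f = (FILE *) user_data;
    fprintf(f, "[llama:%d] %s", (int) level, text); // text normally already ends with a newline
}

int main() {
    llama_log_set(my_log_callback, stderr);
    llama_backend_init(false); // everything from initialization onwards goes through my_log_callback
    llama_backend_free();
    return 0;
}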
llama.h (82 lines changed)
@@ -185,7 +185,7 @@ extern "C" {
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base;  // RoPE base frequency, 0 = from model
         float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
-        float yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
         float yarn_attn_factor; // YaRN magnitude scaling factor
         float yarn_beta_fast;   // YaRN low correction dim
         float yarn_beta_slow;   // YaRN high correction dim
@@ -301,6 +301,23 @@ extern "C" {
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+    // Functions to access the model's GGUF metadata scalar values
+    //  - The functions return the length of the string on success, or -1 on failure
+    //  - The output string is always null-terminated and cleared on failure
+    //  - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
     // Get a string describing the model type
     LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

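As a usage note for the accessors declared above, a caller can enumerate every metadata pair of a loaded model. A minimal sketch ("model.gguf" is a placeholder path, and the numa-flag form of llama_backend_init is assumed for this revision):

#include <cstdio>
#include "llama.h"

int main() {
    llama_backend_init(false);
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    const int n_meta = llama_model_meta_count(model);
    for (int i = 0; i < n_meta; i++) {
        char key[256];
        char val[256];
        llama_model_meta_key_by_index(model, i, key, sizeof(key));
        llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
        // values longer than the buffer are truncated; the return value is the full length
        printf("%s = %s\n", key, val);
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}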
@@ -344,9 +361,60 @@ extern "C" {
     // KV cache
     //

-    // Returns the number of tokens in the KV cache
-    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_max_seq;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_max_seq items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);

     // Clear the KV cache
     LLAMA_API void llama_kv_cache_clear(
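A short debugging sketch of the new view API (assumes an existing llama_context that has already decoded some tokens; error handling omitted):

#include <cstdio>
#include "llama.h"

// Print per-cell occupancy of the KV cache; purely a debugging aid.
void dump_kv_cache(const llama_context * ctx) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, /*n_max_seq =*/ 4);
    llama_kv_cache_view_update(ctx, &view);

    printf("cells: %d, used: %d, tokens: %d, max contiguous free run: %d\n",
            view.n_cells, view.used_cells, view.token_count, view.max_contiguous);

    for (int32_t i = 0; i < view.n_cells; i++) {
        const llama_seq_id * seqs = view.cells_sequences + (size_t) i * view.n_max_seq;
        if (seqs[0] >= 0) { // unused slots in cells_sequences are set to -1 by the update
            printf("cell %d: pos %d, first seq %d\n", i, view.cells[i].pos, seqs[0]);
        }
    }

    llama_kv_cache_view_free(&view);
}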
@ -517,6 +585,12 @@ extern "C" {
|
||||||
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
|
||||||
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
|
||||||
|
|
||||||
|
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||||
|
LLAMA_API int llama_add_bos_token(const struct llama_model * model);
|
||||||
|
|
||||||
|
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||||
|
LLAMA_API int llama_add_eos_token(const struct llama_model * model);
|
||||||
|
|
||||||
// codellama infill tokens
|
// codellama infill tokens
|
||||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
||||||
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
|
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
|
||||||
|
|
|
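The tri-state return of llama_add_bos_token / llama_add_eos_token can be folded into a boolean as in the sketch below (not from this commit); the choice to default to adding BOS when the answer is unknown is an assumption, not something the header prescribes:

// Sketch: interpret the tri-state result of llama_add_bos_token().
// `model` is assumed to be a llama_model loaded elsewhere.
#include <stdbool.h>
#include "llama.h"

static bool should_add_bos(const struct llama_model * model) {
    const int add_bos = llama_add_bos_token(model); // -1 unknown, 0 false, 1 true
    if (add_bos < 0) {
        return true; // unknown: assume BOS should be added (assumed default, not mandated by the API)
    }
    return add_bos != 0;
}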
@ -1,7 +1,5 @@
# tests with BPE tokenizer

import os
import sys
import argparse

from transformers import AutoTokenizer
@ -16,34 +14,34 @@ dir_tokenizer = args.dir_tokenizer
tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)

tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    "\n =",
    "' era",
]

for text in tests:
    print('text: ', text)
@ -1,7 +1,5 @@
# tests with SPM tokenizer

import os
import sys
import argparse

from sentencepiece import SentencePieceProcessor
@ -16,32 +14,32 @@ dir_tokenizer = args.dir_tokenizer
tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model')

tests = [
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
]

for text in tests: