Merge branch 'master' into vulkan-build-integration
Commit f47829aa27

12 changed files with 198 additions and 31 deletions
.github/labeler.yml (vendored, 4 changes)

@@ -16,7 +16,9 @@ SYCL:
     - any-glob-to-any-file:
       - ggml/include/ggml-sycl.h
       - ggml/src/ggml-sycl.cpp
-      - README-sycl.md
+      - ggml/src/ggml-sycl/**
+      - docs/backend/SYCL.md
+      - examples/sycl/**
 Nvidia GPU:
   - changed-files:
     - any-glob-to-any-file:
CMakeLists.txt

@@ -50,9 +50,6 @@ endif()
 # option list
 #
 
-# general
-option(LLAMA_CCACHE "llama: use ccache if available" ON)
-
 # debug
 option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
 option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)

@@ -77,7 +74,6 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
 
 # override ggml options
-set(GGML_CCACHE ${LLAMA_CCACHE})
 set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
 set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})

@@ -115,7 +111,10 @@ llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
 # build the library
 #
 
-add_subdirectory(ggml)
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
 add_subdirectory(src)
 
 #
Makefile (90 changes)

@@ -64,10 +64,14 @@ TEST_TARGETS = \
 	tests/test-tokenizer-1-spm
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
-LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
+LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
 	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm
 
+# Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
+# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
+LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune
+
 # Deprecation aliases
 ifdef LLAMA_CUBLAS
 $(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)

@@ -197,7 +201,7 @@ ifdef GGML_VULKAN
 BUILD_TARGETS += vulkan-shaders-gen
 endif
 
-default: $(BUILD_TARGETS)
+default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)
 
 test: $(TEST_TARGETS)
 	@failures=0; \

@@ -232,7 +236,7 @@ test: $(TEST_TARGETS)
 	fi
 	@echo 'All tests passed.'
 
-all: $(BUILD_TARGETS) $(TEST_TARGETS)
+all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)
 
 ifdef RISCV_CROSS_COMPILE
 CC := riscv64-unknown-linux-gnu-gcc

@@ -249,17 +253,22 @@ MK_CFLAGS = -std=c11 -fPIC
 MK_CXXFLAGS = -std=c++11 -fPIC
 MK_NVCCFLAGS = -std=c++11
 
-ifndef LLAMA_NO_CCACHE
+ifdef LLAMA_NO_CCACHE
+GGML_NO_CCACHE := 1
+DEPRECATE_WARNING := 1
+endif
+
+ifndef GGML_NO_CCACHE
 CCACHE := $(shell which ccache)
 ifdef CCACHE
 export CCACHE_SLOPPINESS = time_macros
-$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
+$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
 CC := $(CCACHE) $(CC)
 CXX := $(CCACHE) $(CXX)
 else
 $(info I ccache not found. Consider installing it for faster compilation.)
 endif # CCACHE
-endif # LLAMA_NO_CCACHE
+endif # GGML_NO_CCACHE
 
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional

@@ -948,6 +957,7 @@ $(info - LLAMA_NO_LLAMAFILE)
 $(info - LLAMA_NO_ACCELERATE)
 $(info - LLAMA_NO_OPENMP)
 $(info - LLAMA_NO_METAL)
+$(info - LLAMA_NO_CCACHE)
 $(info )
 endif
 

@@ -1115,7 +1125,7 @@ clean:
 	rm -rvf $(BUILD_TARGETS)
 	rm -rvf $(TEST_TARGETS)
 	rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
-	rm -rvf $(LEGACY_TARGETS)
+	rm -rvf $(LEGACY_TARGETS_CLEAN)
 	find examples pocs -type f -name "*.o" -delete
 
 #

@@ -1511,3 +1521,69 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
+#
+# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
+#
+# Mark legacy binary targets as .PHONY so that they are always checked.
+.PHONY: main quantize perplexity embedding server finetune
+
+main: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard main))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'main' binary is deprecated. Please use 'llama-cli' instead."
+	@echo "  Remove the 'main' binary to remove this warning."
+	@echo "#########"
+endif
+
+quantize: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard quantize))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
+	@echo "  Remove the 'quantize' binary to remove this warning."
+	@echo "#########"
+endif
+
+perplexity: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard perplexity))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
+	@echo "  Remove the 'perplexity' binary to remove this warning."
+	@echo "#########"
+endif
+
+embedding: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard embedding))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
+	@echo "  Remove the 'embedding' binary to remove this warning."
+	@echo "#########"
+endif
+
+server: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard server))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'server' binary is deprecated. Please use 'llama-server' instead."
+	@echo "  Remove the 'server' binary to remove this warning."
+	@echo "#########"
+endif
+
+finetune: examples/deprecation-warning/deprecation-warning.cpp
+ifneq (,$(wildcard finetune))
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	@echo "#########"
+	@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
+	@echo "  Remove the 'finetune' binary to remove this warning."
+	@echo "#########"
+endif
README.md

@@ -98,7 +98,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
 
-(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
+(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 
 **Multimodal models:**
 

@@ -453,7 +453,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentation
 - [How to build](./docs/build.md)
 - [Running on Docker](./docs/docker.md)
 - [Build on Android](./docs/android.md)
-- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
+- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
 - [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
 
 **Seminal papers and background on the models**
examples/deprecation-warning/README.md (new file, 51 lines)

@@ -0,0 +1,51 @@
+# Migration notice for binary filenames
+
+> [!IMPORTANT]
+[2024 Jun 12] Binaries have been renamed w/ a `llama-` prefix. `main` is now `llama-cli`, `server` is `llama-server`, etc (https://github.com/ggerganov/llama.cpp/pull/7809)
+
+This migration was important, but it is a breaking change that may not always be immediately obvious to users.
+
+Please update all scripts and workflows to use the new binary names.
+
+| Old Filename | New Filename |
+| ---- | ---- |
+| main | llama-cli |
+| server | llama-server |
+| llama-bench | llama-bench |
+| embedding | llama-embedding |
+| finetune | llama-finetune |
+| quantize | llama-quantize |
+| tokenize | llama-tokenize |
+| export-lora | llama-export-lora |
+| libllava.a | libllava.a |
+| baby-llama | llama-baby-llama |
+| batched | llama-batched |
+| batched-bench | llama-batched-bench |
+| benchmark-matmult | llama-benchmark-matmult |
+| convert-llama2c-to-ggml | llama-convert-llama2c-to-ggml |
+| eval-callback | llama-eval-callback |
+| gbnf-validator | llama-gbnf-validator |
+| gguf | llama-gguf |
+| gguf-split | llama-gguf-split |
+| gritlm | llama-gritlm |
+| imatrix | llama-imatrix |
+| infill | llama-infill |
+| llava-cli | llama-llava-cli |
+| lookahead | llama-lookahead |
+| lookup | llama-lookup |
+| lookup-create | llama-lookup-create |
+| lookup-merge | llama-lookup-merge |
+| lookup-stats | llama-lookup-stats |
+| parallel | llama-parallel |
+| passkey | llama-passkey |
+| perplexity | llama-perplexity |
+| q8dot | llama-q8dot |
+| quantize-stats | llama-quantize-stats |
+| retrieval | llama-retrieval |
+| save-load-state | llama-save-load-state |
+| simple | llama-simple |
+| speculative | llama-speculative |
+| train-text-from-scratch | llama-train-text-from-scratch |
+| vdot | llama-vdot |
+| tests/test-c.o | tests/test-c.o |
examples/deprecation-warning/deprecation-warning.cpp (new file, 35 lines)

@@ -0,0 +1,35 @@
+// Warns users that this filename was deprecated, and provides a link for more information.
+
+#include <cstdio>
+#include <string>
+#include <unordered_map>
+
+// Main
+int main(int argc, char** argv) {
+    std::string filename = "main";
+    if (argc >= 1) {
+        filename = argv[0];
+    }
+
+    // Get only the program name from the full path
+    auto pos = filename.find_last_of('/');
+    if (pos != std::string::npos) {
+        filename = filename.substr(pos+1);
+    }
+
+    // Append "llama-" to the beginning of filename to get the replacemnt filename
+    auto replacement_filename = "llama-" + filename;
+
+    // The exception is if the filename is "main", then our replacement filename is "llama-cli"
+    if (filename == "main") {
+        replacement_filename = "llama-cli";
+    }
+
+    fprintf(stdout, "\n");
+    fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str());
+    fprintf(stdout, " Please use '%s' instead.\n", replacement_filename.c_str());
+    fprintf(stdout, " See https://github.com/ggerganov/llama.cpp/tree/master/examples/deprecation-warning/README.md for more information.\n");
+    fprintf(stdout, "\n");
+
+    return EXIT_FAILURE;
+}
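The shim's whole job is the name mapping: strip the directory, prefix `llama-`, and special-case `main`, which became `llama-cli`. A minimal standalone sketch of that logic (the helper name `replacement_name` is illustrative, not part of the commit):

```cpp
#include <cstdio>
#include <string>

// Reproduces the mapping in deprecation-warning.cpp: strip the path,
// prefix "llama-", and special-case "main" -> "llama-cli".
static std::string replacement_name(std::string filename) {
    const auto pos = filename.find_last_of('/');
    if (pos != std::string::npos) {
        filename = filename.substr(pos + 1);
    }
    return filename == "main" ? "llama-cli" : "llama-" + filename;
}

int main() {
    // Spot checks against the table in examples/deprecation-warning/README.md.
    std::printf("%s\n", replacement_name("./main").c_str());          // llama-cli
    std::printf("%s\n", replacement_name("quantize").c_str());        // llama-quantize
    std::printf("%s\n", replacement_name("/usr/bin/server").c_str()); // llama-server
    return 0;
}
```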
examples/server/server.cpp

@@ -884,7 +884,8 @@ struct server_context {
 
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot_params default_params;
-        llama_sampling_params default_sparams;
+        // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
+        llama_sampling_params default_sparams = params.sparams;
         auto & data = task.data;
 
         if (data.count("__oaicompat") != 0) {
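The one-line change means each slot now starts from the sampling defaults the server was launched with, rather than from the struct's built-in defaults, while per-request fields can still override them. A minimal sketch of that initialize-from-global-then-override pattern (the `ServerConfig` and `Request` types are illustrative, not from the codebase):

```cpp
#include <cstdio>
#include <optional>

struct SamplingParams {
    float temperature = 0.8f;   // built-in defaults, used only if the server sets nothing
    int   top_k       = 40;
};

struct ServerConfig { SamplingParams sparams; };             // fixed at server startup
struct Request      { std::optional<float> temperature; };   // fields the client sent

// Start from the server-wide defaults, then apply per-request overrides.
static SamplingParams resolve(const ServerConfig & cfg, const Request & req) {
    SamplingParams p = cfg.sparams;
    if (req.temperature) {
        p.temperature = *req.temperature;
    }
    return p;
}

int main() {
    ServerConfig cfg;
    cfg.sparams.temperature = 0.2f;   // e.g. set via a CLI flag at startup
    const SamplingParams p = resolve(cfg, Request{});
    std::printf("temperature=%.1f top_k=%d\n", p.temperature, p.top_k);  // 0.2 40
    return 0;
}
```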
flake.lock (generated, 20 changes)

@@ -5,11 +5,11 @@
       "nixpkgs-lib": "nixpkgs-lib"
     },
     "locked": {
-      "lastModified": 1717285511,
-      "narHash": "sha256-iKzJcpdXih14qYVcZ9QC9XuZYnPc6T8YImb6dX166kw=",
+      "lastModified": 1719994518,
+      "narHash": "sha256-pQMhCCHyQGRzdfAkdJ4cIWiw+JNuWsTX7f0ZYSyz0VY=",
       "owner": "hercules-ci",
       "repo": "flake-parts",
-      "rev": "2a55567fcf15b1b1c7ed712a2c6fadaec7412ea8",
+      "rev": "9227223f6d922fee3c7b190b2cc238a99527bbb7",
       "type": "github"
     },
     "original": {

@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1719506693,
-        "narHash": "sha256-C8e9S7RzshSdHB7L+v9I51af1gDM5unhJ2xO1ywxNH8=",
+        "lastModified": 1720031269,
+        "narHash": "sha256-rwz8NJZV+387rnWpTYcXaRNvzUSnnF9aHONoJIYmiUQ=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "b2852eb9365c6de48ffb0dc2c9562591f652242a",
+        "rev": "9f4128e00b0ae8ec65918efeba59db998750ead6",
         "type": "github"
       },
       "original": {

@@ -36,14 +36,14 @@
     },
     "nixpkgs-lib": {
       "locked": {
-        "lastModified": 1717284937,
-        "narHash": "sha256-lIbdfCsf8LMFloheeE6N31+BMIeixqyQWbSr2vk79EQ=",
+        "lastModified": 1719876945,
+        "narHash": "sha256-Fm2rDDs86sHy0/1jxTOKB1118Q0O3Uc7EC0iXvXKpbI=",
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
       },
       "original": {
         "type": "tarball",
-        "url": "https://github.com/NixOS/nixpkgs/archive/eb9ceca17df2ea50a250b6b27f7bf6ab0186f198.tar.gz"
+        "url": "https://github.com/NixOS/nixpkgs/archive/5daf0514482af3f97abaefc78a6606365c9108e2.tar.gz"
       }
     },
     "root": {
ggml/src/ggml-sycl.cpp

@@ -3658,6 +3658,10 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
     use_mul_mat_q = use_mul_mat_q && (src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
 #endif // SYCL_USE_XMX
 
+    // mmvq path is faster in the CUDA backend.
+    if (ctx.stream()->get_backend() == sycl::backend::ext_oneapi_cuda)
+        use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+
     if (!split && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
         ggml_sycl_mul_mat_vec_p021(ctx, src0, src1, dst);
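The added guard encodes a backend preference: when SYCL runs on top of CUDA hardware, the quantized mat-vec kernel (mmvq) outperforms the dequantize-then-multiply path, so the latter is disabled whenever the former applies. A sketch of the resulting selection predicate with plain booleans (illustrative only; the real dispatch lives in ggml_sycl_mul_mat):

```cpp
#include <cstdio>

// Illustrative kernel-selection predicate mirroring the hunk above:
// on the CUDA backend, prefer mul_mat_vec_q (mmvq) over
// dequantize_mul_mat_vec whenever both paths are applicable.
static const char * pick_kernel(bool on_cuda_backend,
                                bool use_dequantize_mul_mat_vec,
                                bool use_mul_mat_vec_q) {
    if (on_cuda_backend) {
        use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
    }
    if (use_mul_mat_vec_q)          return "mul_mat_vec_q";
    if (use_dequantize_mul_mat_vec) return "dequantize_mul_mat_vec";
    return "mul_mat (general)";
}

int main() {
    // On CUDA, mmvq wins when both paths are available.
    std::printf("%s\n", pick_kernel(true,  true, true));   // mul_mat_vec_q
    // Elsewhere the dequantize path can still be chosen.
    std::printf("%s\n", pick_kernel(false, true, false));  // dequantize_mul_mat_vec
    return 0;
}
```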
ggml/src/ggml-sycl/rope.cpp

@@ -55,7 +55,7 @@ static void rope_norm(
     const int i = row*ne0 + i0;
     const int i2 = row/p_delta_rows;
 
-    const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+    const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
 
     const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
 

@@ -98,7 +98,7 @@ static void rope_neox(
     const int i = row*ne0 + i0/2;
     const int i2 = row/p_delta_rows;
 
-    const float theta_base = pos[i2]*powf(theta_scale, i0/2.0f);
+    const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f);
 
     const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f;
 
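Both hunks replace `powf` with `sycl::pow` in the RoPE angle computation, theta_base = pos[i2] * theta_scale^(i0/2), so the call resolves to the SYCL math library inside the kernel. A host-side sketch of the same angle formula (the 10000 frequency base and 128-dim head below are typical placeholder values, not taken from this diff):

```cpp
#include <cmath>
#include <cstdio>

int main() {
    // theta_scale is typically pow(freq_base, -2/head_dim); placeholder values here.
    const float theta_scale = std::pow(10000.0f, -2.0f / 128.0f);
    const int   pos         = 42;   // token position (pos[i2] in the kernel)

    // i0 steps over dimension pairs, exactly as in rope_norm/rope_neox.
    for (int i0 = 0; i0 < 8; i0 += 2) {
        const float theta_base = pos * std::pow(theta_scale, i0 / 2.0f);
        std::printf("i0=%d  theta_base=%f\n", i0, theta_base);
    }
    return 0;
}
```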
gguf-py/gguf/lazy.py

@@ -6,7 +6,6 @@ from typing import Any, Callable
 from collections import deque
 
 import numpy as np
-from numpy._typing import _Shape
 from numpy.typing import DTypeLike
 
 

@@ -219,7 +218,7 @@ class LazyNumpyTensor(LazyBase):
     _tensor_type = np.ndarray
 
     @classmethod
-    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: _Shape) -> np.ndarray[Any, Any]:
+    def meta_with_dtype_and_shape(cls, dtype: DTypeLike, shape: tuple[int, ...]) -> np.ndarray[Any, Any]:
         # The initial idea was to use np.nan as the fill value,
         # but non-float types like np.int16 can't use that.
         # So zero it is.
grammars/README.md

@@ -4,7 +4,7 @@ GBNF (GGML BNF) is a format for defining [formal grammars](https://en.wikipedia.
 
 ## Background
 
-[Bakus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
+[Backus-Naur Form (BNF)](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form) is a notation for describing the syntax of formal languages like programming languages, file formats, and protocols. GBNF is an extension of BNF that primarily adds a few modern regex-like features.
 
 ## Basics
 