Merge branch 'ggerganov:master' into gguf-model-template

commit 964ee4b2ca
Author: Austin
Date: 2024-07-28 18:57:52 -04:00
Committed by: GitHub (GPG key ID B5690EEEBB952194; no known key found for this signature in database)
62 changed files with 2130 additions and 1695 deletions


@@ -860,7 +860,8 @@ jobs:
 mkdir build
 cd build
 cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
+cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
+cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
 - name: Determine tag name
   id: tag

.gitignore

@@ -50,6 +50,7 @@ build*
 !docs/build.md
 /libllama.so
 /llama-*
+/vulkan-shaders-gen
 android-ndk-*
 arm_neon.h
 cmake-build-*


@@ -325,9 +325,9 @@ ifdef LLAMA_DEBUG
 endif
 else
 MK_CPPFLAGS += -DNDEBUG
-MK_CFLAGS   += -O3
-MK_CXXFLAGS += -O3
-MK_NVCCFLAGS += -O3
+MK_CFLAGS   += -O3 -g
+MK_CXXFLAGS += -O3 -g
+MK_NVCCFLAGS += -O3 -g
 endif
 ifdef LLAMA_SANITIZE_THREAD
@@ -528,10 +528,21 @@ ifndef GGML_NO_ACCELERATE
 endif
 endif # GGML_NO_ACCELERATE
+ifdef GGML_MUSA
+	CC := clang
+	CXX := clang++
+	GGML_CUDA := 1
+	MK_CPPFLAGS += -DGGML_USE_MUSA
+endif
 ifndef GGML_NO_OPENMP
 	MK_CPPFLAGS += -DGGML_USE_OPENMP
 	MK_CFLAGS   += -fopenmp
 	MK_CXXFLAGS += -fopenmp
+	ifdef GGML_MUSA
+		MK_CPPFLAGS += -I/usr/lib/llvm-10/include/openmp
+		MK_LDFLAGS  += -L/usr/lib/llvm-10/lib
+	endif # GGML_MUSA
 endif # GGML_NO_OPENMP
 ifdef GGML_OPENBLAS
@@ -582,15 +593,27 @@ else
 endif # GGML_CUDA_FA_ALL_QUANTS
 ifdef GGML_CUDA
-	ifneq ('', '$(wildcard /opt/cuda)')
-		CUDA_PATH ?= /opt/cuda
-	else
-		CUDA_PATH ?= /usr/local/cuda
-	endif
-	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-	MK_NVCCFLAGS += -use_fast_math
+	ifdef GGML_MUSA
+		ifneq ('', '$(wildcard /opt/musa)')
+			CUDA_PATH ?= /opt/musa
+		else
+			CUDA_PATH ?= /usr/local/musa
+		endif
+		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
+		MK_LDFLAGS   += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
+		MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
+	else
+		ifneq ('', '$(wildcard /opt/cuda)')
+			CUDA_PATH ?= /opt/cuda
+		else
+			CUDA_PATH ?= /usr/local/cuda
+		endif
+		MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+		MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+		MK_NVCCFLAGS += -use_fast_math
+	endif # GGML_MUSA
 OBJ_GGML += ggml/src/ggml-cuda.o
 OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
@@ -600,9 +623,11 @@ ifdef LLAMA_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # LLAMA_FATAL_WARNINGS
+ifndef GGML_MUSA
 ifndef JETSON_EOL_MODULE_DETECT
 	MK_NVCCFLAGS += --forward-unknown-to-host-compiler
 endif # JETSON_EOL_MODULE_DETECT
+endif # GGML_MUSA
 ifdef LLAMA_DEBUG
 	MK_NVCCFLAGS += -lineinfo
@@ -615,8 +640,12 @@ endif # GGML_CUDA_DEBUG
 ifdef GGML_CUDA_NVCC
 	NVCC = $(CCACHE) $(GGML_CUDA_NVCC)
 else
-	NVCC = $(CCACHE) nvcc
-endif #GGML_CUDA_NVCC
+	ifdef GGML_MUSA
+		NVCC = $(CCACHE) mcc
+	else
+		NVCC = $(CCACHE) nvcc
+	endif # GGML_MUSA
+endif # GGML_CUDA_NVCC
 ifdef CUDA_DOCKER_ARCH
 	MK_NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
@@ -687,9 +716,15 @@ define NVCC_COMPILE
 	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
 else
+ifdef GGML_MUSA
+define NVCC_COMPILE
+	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -c $< -o $@
+endef # NVCC_COMPILE
+else
 define NVCC_COMPILE
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endef # NVCC_COMPILE
+endif # GGML_MUSA
 endif # JETSON_EOL_MODULE_DETECT
 ggml/src/ggml-cuda/%.o: \
@@ -944,6 +979,7 @@ $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef GGML_CUDA
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
 CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifndef GGML_MUSA
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
@@ -953,6 +989,7 @@ endif # CUDA_POWER_ARCH
 endif # CUDA_DOCKER_ARCH
 endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
+endif # GGML_MUSA
 endif # GGML_CUDA
 $(info )


@@ -409,6 +409,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |


@@ -1324,6 +1324,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     else { invalid_param = true; }
     return true;
 }
+if (arg == "--no-warmup") {
+    params.warmup = false;
+    return true;
+}
 #ifndef LOG_DISABLE_LOGS
 // Parse args for logging parameters
 if (log_param_single_parse(argv[i])) {
@@ -1446,6 +1450,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
 options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
 options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
 options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+options.push_back({ "main", " --no-warmup", "skip warming up the model with an empty run" });
 options.push_back({ "server infill",
 " --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });


@@ -1570,6 +1570,34 @@ class LlamaModel(Model):
         return [(self.map_tensor_name(name), data_torch)]
     def prepare_tensors(self):
+        if rope_scaling := self.find_hparam(["rope_scaling"], optional=True):
+            if rope_scaling.get("rope_type", '').lower() == "llama3":
+                base = self.hparams.get("rope_theta", 10000.0)
+                dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"]
+                freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))
+                factor = rope_scaling.get("factor", 8.0)
+                low_freq_factor = rope_scaling.get("low_freq_factor", 1.0)
+                high_freq_factor = rope_scaling.get("high_freq_factor", 4.0)
+                old_context_len = self.hparams.get("original_max_position_embeddings", 8192)
+                low_freq_wavelen = old_context_len / low_freq_factor
+                high_freq_wavelen = old_context_len / high_freq_factor
+                assert low_freq_wavelen != high_freq_wavelen
+                rope_factors = []
+                for freq in freqs:
+                    wavelen = 2 * math.pi / freq
+                    if wavelen < high_freq_wavelen:
+                        rope_factors.append(1)
+                    elif wavelen > low_freq_wavelen:
+                        rope_factors.append(factor)
+                    else:
+                        smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor)
+                        rope_factors.append(1 / ((1 - smooth) / factor + smooth))
+                self.gguf_writer.add_tensor(self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), np.array(rope_factors, dtype=np.float32))
         super().prepare_tensors()
         if self._experts is not None:
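Editorial note (not part of the commit): the rope factors computed by the added conversion code follow the Llama 3.1 frequency-smoothing rule. Writing $\lambda = 2\pi/f$ for the wavelength of a RoPE frequency $f$, with $s$ = `factor`, $\ell$ = `low_freq_factor`, $h$ = `high_freq_factor`, and $L$ = `original_max_position_embeddings`, the loop above stores:

```latex
% Summary of the loop above; symbols map to the Python variables in the diff.
\[
\mathrm{rope\_factor}(\lambda) =
\begin{cases}
1 & \lambda < L/h \\
s & \lambda > L/\ell \\
\left(\dfrac{1-\mu}{s} + \mu\right)^{-1} & \text{otherwise,} \quad
\mu = \dfrac{L/\lambda - \ell}{h - \ell}
\end{cases}
\]
```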


@@ -192,6 +192,19 @@ The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/c
 | GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
 | GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
+### MUSA
+- Using `make`:
+  ```bash
+  make GGML_MUSA=1
+  ```
+- Using `CMake`:
+  ```bash
+  cmake -B build -DGGML_MUSA=ON
+  cmake --build build --config Release
+  ```
 ### hipBLAS
 This provides BLAS acceleration on HIP-supported AMD GPUs.


@@ -62,7 +62,7 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
     } else if (type == GGML_TYPE_I8) {
         v = (float) *(int8_t *) &data[i];
     } else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
     printf("%12.4f", v);
     sum += v;


@@ -127,7 +127,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     }
     else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
         fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
-        exit(1); //GGML_ASSERT(false);
+        exit(1); //GGML_ABORT("fatal error");
     }
     if (m_params.verbosity > 1) {
         printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
@@ -176,7 +176,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     }
     else if (e.values.size() != (size_t)src1->ne[0]) {
         fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
-        exit(1); //GGML_ASSERT(false);
+        exit(1); //GGML_ABORT("fatal error");
     }
     ++e.ncall;
     if (m_params.verbosity > 1) {


@@ -150,7 +150,7 @@ static const char * output_format_str(output_formats format) {
         case JSON: return "json";
         case MARKDOWN: return "md";
         case SQL: return "sql";
-        default: GGML_ASSERT(!"invalid output format");
+        default: GGML_ABORT("invalid output format");
     }
 }
@@ -176,7 +176,7 @@ static const char * split_mode_str(llama_split_mode mode) {
         case LLAMA_SPLIT_MODE_NONE: return "none";
         case LLAMA_SPLIT_MODE_LAYER: return "layer";
         case LLAMA_SPLIT_MODE_ROW: return "row";
-        default: GGML_ASSERT(!"invalid split mode");
+        default: GGML_ABORT("invalid split mode");
     }
 }
@@ -1326,7 +1326,7 @@ static std::unique_ptr<printer> create_printer(output_formats format) {
         case SQL:
             return std::unique_ptr<printer>(new sql_printer());
     }
-    GGML_ASSERT(false);
+    GGML_ABORT("fatal error");
 }
 int main(int argc, char ** argv) {


@@ -869,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = peg_0;
     }
     else {
-        GGML_ASSERT(false);
+        GGML_ABORT("fatal error");
     }
 }


@@ -47,7 +47,7 @@ int main(int argc, char ** argv) {
     // save state (rng, logits, embedding and kv_cache) to file
     {
         std::vector<uint8_t> state_mem(llama_state_get_size(ctx));
-        const size_t written = llama_state_get_data(ctx, state_mem.data());
+        const size_t written = llama_state_get_data(ctx, state_mem.data(), state_mem.size());
         FILE *fp_write = fopen("dump_state.bin", "wb");
         fwrite(state_mem.data(), 1, written, fp_write);
@@ -99,13 +99,16 @@ int main(int argc, char ** argv) {
     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_state_get_size(ctx2));
+        std::vector<uint8_t> state_mem;
         FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);
         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);
-        if (read != llama_state_set_data(ctx2, state_mem.data())) {
+        if (read != llama_state_set_data(ctx2, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx2);
             llama_free_model(model);
@@ -159,13 +162,16 @@ int main(int argc, char ** argv) {
     // load state (rng, logits, embedding and kv_cache) from file
     {
-        std::vector<uint8_t> state_mem(llama_state_get_size(ctx3));
+        std::vector<uint8_t> state_mem;
         FILE * fp_read = fopen("dump_state.bin", "rb");
+        fseek(fp_read, 0, SEEK_END);
+        state_mem.resize(ftell(fp_read));
+        fseek(fp_read, 0, SEEK_SET);
         const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read);
         fclose(fp_read);
-        if (read != llama_state_set_data(ctx3, state_mem.data())) {
+        if (read != llama_state_set_data(ctx3, state_mem.data(), state_mem.size())) {
             fprintf(stderr, "\n%s : failed to read state\n", __func__);
             llama_free(ctx3);
             llama_free_model(model);
@@ -182,7 +188,7 @@ int main(int argc, char ** argv) {
     {
         // save kv of seq 0
         std::vector<uint8_t> seq_store(llama_state_seq_get_size(ctx3, 0));
-        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0);
+        const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), seq_store.size(), 0);
         if (ncopy != seq_store.size()) {
             fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size());
             llama_free(ctx3);
@@ -196,7 +202,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s : kv cache cleared\n", __func__);
         // restore kv into seq 1
-        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1);
+        const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), seq_store.size(), 1);
         if (nset != seq_store.size()) {
             fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size());
             llama_free(ctx3);
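Editorial sketch (not part of the commit): the hunks above move the example to the explicit-size state API, where `llama_state_get_data` and `llama_state_set_data` also receive the buffer length so the library cannot write or read past the caller's allocation. A minimal save/restore helper built on those calls could look like this; the file name and error handling are illustrative only.

```cpp
// Editorial sketch (not part of the diff): save/restore full context state
// with the explicit-size llama_state_* calls shown in the hunks above.
#include <cstdio>
#include <vector>
#include "llama.h"

static bool save_state_to_file(llama_context * ctx, const char * path) {
    std::vector<uint8_t> buf(llama_state_get_size(ctx));            // upper bound on state size
    const size_t written = llama_state_get_data(ctx, buf.data(), buf.size());
    FILE * f = fopen(path, "wb");
    if (!f) { return false; }
    const bool ok = fwrite(buf.data(), 1, written, f) == written;
    fclose(f);
    return ok;
}

static bool load_state_from_file(llama_context * ctx, const char * path) {
    FILE * f = fopen(path, "rb");
    if (!f) { return false; }
    fseek(f, 0, SEEK_END);
    std::vector<uint8_t> buf((size_t) ftell(f));                     // size the buffer from the file
    fseek(f, 0, SEEK_SET);
    const size_t read = fread(buf.data(), 1, buf.size(), f);
    fclose(f);
    // set_data now also takes the buffer size, so a truncated file cannot overrun
    return read == buf.size() && llama_state_set_data(ctx, buf.data(), buf.size()) == read;
}
```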


@@ -163,7 +163,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) {
         printf(">");
         return;
     }
-    GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way.");
+    GGML_ABORT("MultiByteToWideChar() failed in an unexpected way.");
     LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr));


@@ -50,9 +50,15 @@ else()
     set(GGML_BLAS_VENDOR_DEFAULT "Generic")
 endif()
+if (CMAKE_CROSSCOMPILING)
+    set(GGML_NATIVE_DEFAULT OFF)
+else()
+    set(GGML_NATIVE_DEFAULT ON)
+endif()
 # general
 option(GGML_STATIC "ggml: static link libraries" OFF)
-option(GGML_NATIVE "ggml: enable -march=native flag" ON)
+option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
 option(GGML_LTO "ggml: enable link time optimization" OFF)
 option(GGML_CCACHE "ggml: use ccache if available" ON)
@@ -70,7 +76,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
 option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
 # instruction set specific
-if (GGML_NATIVE)
+if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
     set(INS_ENB OFF)
 else()
     set(INS_ENB ON)
@@ -107,6 +113,7 @@ set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
 option(GGML_LLAMAFILE "ggml: use LLAMAFILE" OFF)
 option(GGML_CUDA "ggml: use CUDA" OFF)
+option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
 option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)


@@ -6,6 +6,9 @@
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"
+#elif defined(GGML_USE_MUSA)
+#define GGML_CUDA_NAME "MUSA"
+#define GGML_CUBLAS_NAME "muBLAS"
 #else
 #define GGML_CUDA_NAME "CUDA"
 #define GGML_CUBLAS_NAME "cuBLAS"


@@ -254,18 +254,8 @@
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fflush(stdout); \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
-            abort(); \
-        } \
-    } while (0)
 #ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
@@ -274,6 +264,17 @@
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
+#ifdef __cplusplus
+#define GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#define GGML_NORETURN __declspec(noreturn)
+#else
+#define GGML_NORETURN _Noreturn
+#endif
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
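Editorial sketch (not part of the commit): the hunk above replaces the old block-style `GGML_ASSERT` with a `GGML_ABORT(...)` that forwards `__FILE__`/`__LINE__` and a printf-style message to a noreturn `ggml_abort()`. A self-contained illustration of the same pattern, using hypothetical names (`my_abort`, `MY_ABORT`, `MY_ASSERT`) rather than the real ggml symbols:

```cpp
// Editorial sketch (not part of the diff): the abort/assert pattern introduced above,
// with hypothetical names so it does not shadow the real ggml API.
#include <cstdarg>
#include <cstdio>
#include <cstdlib>

[[noreturn]] static void my_abort(const char * file, int line, const char * fmt, ...) {
    fflush(stdout);
    fprintf(stderr, "%s:%d: ", file, line);
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);   // formatted message, like GGML_ABORT("invalid mode: %d", m)
    va_end(args);
    fprintf(stderr, "\n");
    abort();
}

#define MY_ABORT(...) my_abort(__FILE__, __LINE__, __VA_ARGS__)
#define MY_ASSERT(x)  do { if (!(x)) MY_ABORT("assertion failed: %s", #x); } while (0)

int main() {
    MY_ASSERT(1 + 1 == 2);             // passes silently
    MY_ABORT("invalid mode: %d", 42);  // prints file:line plus the message, then aborts
}
```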
@@ -322,6 +323,9 @@
 extern "C" {
 #endif
+    GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+    GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
 enum ggml_status {
     GGML_STATUS_ALLOC_FAILED = -2,
     GGML_STATUS_FAILED = -1,
@@ -636,8 +640,11 @@ extern "C" {
     GGML_CGRAPH_EVAL_ORDER_COUNT
 };
+typedef uint32_t ggml_bitset_t;
 struct ggml_hash_set {
     size_t size;
+    ggml_bitset_t * used;
     struct ggml_tensor ** keys;
 };
@@ -651,7 +658,7 @@ extern "C" {
     struct ggml_tensor ** grads;
     struct ggml_tensor ** leafs;
-    struct ggml_hash_set visited_hash_table;
+    struct ggml_hash_set visited_hash_set;
     enum ggml_cgraph_eval_order order;
 };
@@ -698,8 +705,6 @@ extern "C" {
 GGML_API int64_t ggml_cycles(void);
 GGML_API int64_t ggml_cycles_per_ms(void);
-GGML_API void ggml_print_backtrace(void);
 // accepts a UTF-8 path, even on Windows
 GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
@@ -2005,8 +2010,8 @@ extern "C" {
 // ggml_graph_plan() has to be called before ggml_graph_compute()
 // when plan.work_size > 0, caller must allocate memory for plan.work_data
 GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
 // same as ggml_graph_compute() but the work data is allocated as a part of the context
 // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
 GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);


@@ -139,6 +139,17 @@ if (GGML_METAL)
     )
 endif()
+if (GGML_MUSA)
+    set(CMAKE_C_COMPILER clang)
+    set(CMAKE_C_EXTENSIONS OFF)
+    set(CMAKE_CXX_COMPILER clang++)
+    set(CMAKE_CXX_EXTENSIONS OFF)
+    set(GGML_CUDA ON)
+    list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
+endif()
 if (GGML_OPENMP)
     find_package(OpenMP)
     if (OpenMP_FOUND)
@@ -147,6 +158,11 @@ if (GGML_OPENMP)
         add_compile_definitions(GGML_USE_OPENMP)
         set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        if (GGML_MUSA)
+            set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} "/usr/lib/llvm-10/include/openmp")
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} "/usr/lib/llvm-10/lib/libomp.so")
+        endif()
     else()
         message(WARNING "OpenMP not found")
     endif()
@@ -249,7 +265,13 @@ endif()
 if (GGML_CUDA)
     cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
-    find_package(CUDAToolkit)
+    if (GGML_MUSA)
+        list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
+        find_package(MUSAToolkit)
+        set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
+    else()
+        find_package(CUDAToolkit)
+    endif()
     if (CUDAToolkit_FOUND)
         message(STATUS "CUDA found")
@@ -268,7 +290,11 @@ if (GGML_CUDA)
         endif()
         message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-        enable_language(CUDA)
+        if (GGML_MUSA)
+            set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
+        else()
+            enable_language(CUDA)
+        endif()
         file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
         list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
@@ -332,21 +358,40 @@ if (GGML_CUDA)
             add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
         endif()
+        if (GGML_MUSA)
+            set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
+            foreach(SOURCE ${GGML_SOURCES_CUDA})
+                set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_22")
+            endforeach()
+        endif()
         if (GGML_STATIC)
             if (WIN32)
                 # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
                 set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
             else ()
-                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+                if (GGML_MUSA)
+                    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart_static MUSA::mublas_static)
+                else()
+                    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+                endif()
             endif()
         else()
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+            if (GGML_MUSA)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musart MUSA::mublas)
+            else()
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+            endif()
         endif()
         if (GGML_CUDA_NO_VMM)
            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
         else()
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+            if (GGML_MUSA)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
+            else()
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+            endif()
         endif()
     else()
         message(WARNING "CUDA not found")
@@ -857,8 +902,10 @@ function(get_flags CCID CCVER)
         set(C_FLAGS -Wdouble-promotion)
         set(CXX_FLAGS -Wno-array-bounds)
-        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-            list(APPEND CXX_FLAGS -Wno-format-truncation)
+        if (NOT GGML_MUSA)
+            if (CCVER VERSION_GREATER_EQUAL 7.1.0)
+                list(APPEND CXX_FLAGS -Wno-format-truncation)
+            endif()
         endif()
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             list(APPEND CXX_FLAGS -Wextra-semi)
@@ -1264,6 +1311,7 @@ endif()
 target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
 target_include_directories(ggml PUBLIC ../include)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
+target_link_directories(ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
 target_compile_features (ggml PRIVATE c_std_11) # don't bump
 target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})


@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
+        GGML_ABORT("not enough space in the buffer");
     }
     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
             return;
         }
     }
-    GGML_ASSERT(!"out of allocated_tensors");
+    GGML_ABORT("out of allocated_tensors");
 }
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
             return;
         }
     }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
+    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
 }
 #endif
@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
         // this should never happen
         fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                 __func__, size, max_avail);
-        GGML_ASSERT(!"not enough space in the buffer");
-        GGML_UNREACHABLE();
+        GGML_ABORT("not enough space in the buffer");
     }
 }
@@ -443,7 +440,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
         }
     }
-    free(galloc->hash_set.keys);
+    ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
     free(galloc->buffers);
@@ -456,7 +453,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;
 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
     return &galloc->hash_values[i];
 }
@@ -565,8 +562,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
 static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
+    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
     // allocate leafs
     // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +668,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 }
 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
+    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    // add 25% margin to avoid hash collisions
+    min_hash_size += min_hash_size / 4;
     // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
-        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
+    if (galloc->hash_set.size < min_hash_size) {
+        ggml_hash_set_free(&galloc->hash_set);
+        galloc->hash_set = ggml_hash_set_new(min_hash_size);
         GGML_ASSERT(galloc->hash_set.keys != NULL);
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
         GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
     }
     // reset allocators
@@ -817,8 +812,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
     return talloc->size_max >= node_size;
 }
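Editorial sketch (not part of the commit, and not the actual ggml implementation): the `ggml_hash_set_*` calls used above pair a `keys` array with a bitset of used slots, so resetting the set only needs to clear the bitset. A simplified linear-probing version of that idea, assuming the caller sizes the table with headroom (as the 25% margin above does):

```cpp
// Editorial sketch (not part of the diff): a linear-probing pointer set with a
// separate "used" bitset, approximating the reworked hash-set pattern.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

struct hash_set {
    size_t                    size;
    std::vector<uint32_t>     used;  // one bit per slot; clearing this resets the set
    std::vector<const void *> keys;
};

static hash_set hash_set_new(size_t size) {
    return hash_set{size, std::vector<uint32_t>((size + 31) / 32, 0), std::vector<const void *>(size, nullptr)};
}

static void hash_set_reset(hash_set & hs) {
    // stale keys are ignored later because their "used" bit is zero
    std::fill(hs.used.begin(), hs.used.end(), 0u);
}

static size_t hash_find_or_insert(hash_set & hs, const void * key) {
    size_t i = (reinterpret_cast<uintptr_t>(key) >> 4) % hs.size;  // crude pointer hash
    for (;;) {                                                     // assumes the table is never completely full
        const bool occupied = (hs.used[i / 32] >> (i % 32)) & 1u;
        if (!occupied) {                    // free slot: insert and return its index
            hs.used[i / 32] |= 1u << (i % 32);
            hs.keys[i] = key;
            return i;
        }
        if (hs.keys[i] == key) {            // already present
            return i;
        }
        i = (i + 1) % hs.size;              // linear probing
    }
}
```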


@ -1055,11 +1055,10 @@ struct ggml_backend_sched {
ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS]; ggml_backend_buffer_type_t bufts[GGML_SCHED_MAX_BACKENDS];
ggml_gallocr_t galloc; ggml_gallocr_t galloc;
// hash keys of the nodes in the graph // hash map of the nodes in the graph
struct ggml_hash_set hash_set; struct ggml_hash_set hash_set;
// hash values int * hv_tensor_backend_ids; // [hash_set.size]
int * tensor_backend_id; struct ggml_tensor ** hv_tensor_copies; // [hash_set.size][n_backends][n_copies]
struct ggml_tensor * (* tensor_copies)[GGML_SCHED_MAX_BACKENDS][GGML_SCHED_MAX_COPIES];
int * node_backend_ids; // [graph_size] int * node_backend_ids; // [graph_size]
int * leaf_backend_ids; // [graph_size] int * leaf_backend_ids; // [graph_size]
@ -1068,7 +1067,7 @@ struct ggml_backend_sched {
int * prev_leaf_backend_ids; // [graph_size] int * prev_leaf_backend_ids; // [graph_size]
// copy of the graph with modified inputs // copy of the graph with modified inputs
struct ggml_cgraph * graph; struct ggml_cgraph graph;
// graph splits // graph splits
struct ggml_backend_sched_split * splits; struct ggml_backend_sched_split * splits;
@ -1087,19 +1086,16 @@ struct ggml_backend_sched {
ggml_backend_sched_eval_callback callback_eval; ggml_backend_sched_eval_callback callback_eval;
void * callback_eval_user_data; void * callback_eval_user_data;
bool debug; char * context_buffer;
size_t context_buffer_size;
// align context_buffer to GGML_MEM_ALIGN bool debug;
#ifdef _MSC_VER
__declspec(align(GGML_MEM_ALIGN))
#else
__attribute__((aligned(GGML_MEM_ALIGN)))
#endif
char context_buffer[GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
}; };
#define hash_id(tensor) ggml_hash_find_or_insert(sched->hash_set, tensor) #define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
#define tensor_backend_id(tensor) sched->tensor_backend_id[hash_id(tensor)] #define tensor_backend_id(tensor) sched->hv_tensor_backend_ids[hash_id(tensor)]
#define tensor_id_copy(id, backend_id, copy_id) sched->hv_tensor_copies[(id) * sched->n_backends * sched->n_copies + (backend_id) * sched->n_copies + (copy_id)]
#define tensor_copy(tensor, backend_id, copy_id) tensor_id_copy(hash_id(tensor), backend_id, copy_id)
// returns the priority of the backend, lower id is higher priority // returns the priority of the backend, lower id is higher priority
static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
@ -1169,7 +1165,6 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
return cur_backend_id; return cur_backend_id;
} }
// assign nodes that use weights to the backend of the weights
// operations with weights are preferably run on the same backend as the weights // operations with weights are preferably run on the same backend as the weights
for (int i = 0; i < GGML_MAX_SRC; i++) { for (int i = 0; i < GGML_MAX_SRC; i++) {
const struct ggml_tensor * src = tensor->src[i]; const struct ggml_tensor * src = tensor->src[i];
@ -1275,7 +1270,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->is_reset = false; sched->is_reset = false;
struct ggml_init_params params = { struct ggml_init_params params = {
/* .mem_size = */ sizeof(sched->context_buffer), /* .mem_size = */ sched->context_buffer_size,
/* .mem_buffer = */ sched->context_buffer, /* .mem_buffer = */ sched->context_buffer,
/* .no_alloc = */ true /* .no_alloc = */ true
}; };
@ -1284,39 +1279,43 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->ctx = ggml_init(params); sched->ctx = ggml_init(params);
if (sched->ctx == NULL) { if (sched->ctx == NULL) {
fprintf(stderr, "%s: failed to initialize context\n", __func__); GGML_ABORT("%s: failed to initialize context\n", __func__);
GGML_ASSERT(false);
} }
// pass 1: assign backends to ops with pre-allocated inputs // pass 1: assign backends to ops with pre-allocated inputs
for (int i = 0; i < graph->n_leafs; i++) { for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i]; struct ggml_tensor * leaf = graph->leafs[i];
int * leaf_backend_id = &tensor_backend_id(leaf); int * leaf_backend_id = &tensor_backend_id(leaf);
if (*leaf_backend_id != -1) { // do not overwrite user assignments
// do not overwrite user assignments if (*leaf_backend_id == -1) {
continue; *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
} }
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
} }
for (int i = 0; i < graph->n_nodes; i++) { for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node = graph->nodes[i];
int * node_backend_id = &tensor_backend_id(node); int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) { // do not overwrite user assignments
// do not overwrite user assignments if (*node_backend_id == -1) {
continue; *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
}
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); #if 0
// src // src
for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->op == GGML_OP_NONE) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue; continue;
} }
int * src_backend_id = &tensor_backend_id(src);
if (*src_backend_id == -1) { for (int j = 0; j < GGML_MAX_SRC; j++) {
*src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
int * src_backend_id = &tensor_backend_id(src);
if (*src_backend_id == -1) {
*src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
}
} }
#endif
} }
} }
@ -1488,12 +1487,13 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} }
} }
// pass 4: split graph, find tensors that need to be copied // pass 5: split graph, find tensors that need to be copied
{ {
int i_split = 0; int i_split = 0;
struct ggml_backend_sched_split * split = &sched->splits[0]; struct ggml_backend_sched_split * split = &sched->splits[0];
// find the backend of the first split, skipping view ops // find the backend of the first split, skipping view ops
for (int i = 0; i < graph->n_nodes; i++) { int i = 0;
for (; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node = graph->nodes[i];
if (!ggml_is_view_op(node->op)) { if (!ggml_is_view_op(node->op)) {
split->backend_id = tensor_backend_id(node); split->backend_id = tensor_backend_id(node);
@ -1502,9 +1502,8 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} }
split->i_start = 0; split->i_start = 0;
split->n_inputs = 0; split->n_inputs = 0;
memset(split->inputs, 0, sizeof(split->inputs)); //HACK
int cur_backend_id = split->backend_id; int cur_backend_id = split->backend_id;
for (int i = 0; i < graph->n_nodes; i++) { for (; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) { if (ggml_is_view_op(node->op)) {
@ -1513,7 +1512,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
const int node_backend_id = tensor_backend_id(node); const int node_backend_id = tensor_backend_id(node);
GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now assert(node_backend_id != -1); // all nodes should be assigned by now
// check if we should start a new split based on the sources of the current node // check if we should start a new split based on the sources of the current node
bool need_new_split = false; bool need_new_split = false;
@ -1527,7 +1526,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// by starting a new split, the memory of the previously offloaded weights can be reused // by starting a new split, the memory of the previously offloaded weights can be reused
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = tensor_backend_id(src); int src_backend_id = tensor_backend_id(src);
if (src_backend_id != -1 && src_backend_id != cur_backend_id) { if (src_backend_id != cur_backend_id) {
need_new_split = true; need_new_split = true;
break; break;
} }
@ -1536,9 +1535,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// FIXME: count the number of inputs instead of only checking when full // FIXME: count the number of inputs instead of only checking when full
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
const size_t id = hash_id(src); const size_t id = hash_id(src);
int src_backend_id = sched->tensor_backend_id[id]; int src_backend_id = sched->hv_tensor_backend_ids[id];
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) { if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true; need_new_split = true;
break; break;
@ -1570,12 +1569,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
continue; continue;
} }
const int src_backend_id = tensor_backend_id(src); size_t src_id = hash_id(src);
const int src_backend_id = sched->hv_tensor_backend_ids[src_id];
assert(src_backend_id != -1); // all inputs should be assigned by now assert(src_backend_id != -1); // all inputs should be assigned by now
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
size_t id = hash_id(src); if (tensor_id_copy(src_id, src_backend_id, 0) == NULL) {
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[src_backend_id]; ggml_backend_t backend = sched->backends[src_backend_id];
for (int c = 0; c < sched->n_copies; c++) { for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * tensor_copy; struct ggml_tensor * tensor_copy;
@ -1589,7 +1588,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_input(tensor_copy); ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
} }
sched->tensor_copies[id][src_backend_id][c] = tensor_copy; tensor_id_copy(src_id, src_backend_id, c) = tensor_copy;
SET_CAUSE(tensor_copy, "4.cpy"); SET_CAUSE(tensor_copy, "4.cpy");
} }
int n_graph_inputs = sched->n_graph_inputs++; int n_graph_inputs = sched->n_graph_inputs++;
@ -1598,11 +1597,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
} }
} }
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id); if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
if (src_backend_id != cur_backend_id && !supported) {
// create a copy of the input in the split's backend // create a copy of the input in the split's backend
const size_t id = hash_id(src); if (tensor_id_copy(src_id, cur_backend_id, 0) == NULL) {
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[cur_backend_id]; ggml_backend_t backend = sched->backends[cur_backend_id];
for (int c = 0; c < sched->n_copies; c++) { for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@ -1611,14 +1608,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_input(tensor_copy); ggml_set_input(tensor_copy);
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
} }
sched->tensor_copies[id][cur_backend_id][c] = tensor_copy; tensor_id_copy(src_id, cur_backend_id, c) = tensor_copy;
SET_CAUSE(tensor_copy, "4.cpy"); SET_CAUSE(tensor_copy, "4.cpy");
} }
int n_inputs = split->n_inputs++; int n_inputs = split->n_inputs++;
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
split->inputs[n_inputs] = src; split->inputs[n_inputs] = src;
} }
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy]; node->src[j] = tensor_id_copy(src_id, cur_backend_id, sched->cur_copy);
} }
} }
} }
@ -1630,7 +1627,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_backend_sched_print_assignments(sched, graph); ggml_backend_sched_print_assignments(sched, graph);
} }
// swap node_backend_ids and leaf_backend_ids and prevs // swap node_backend_ids and leaf _backend_ids with prevs
{ {
int * tmp = sched->node_backend_ids; int * tmp = sched->node_backend_ids;
sched->node_backend_ids = sched->prev_node_backend_ids; sched->node_backend_ids = sched->prev_node_backend_ids;
@ -1641,9 +1638,19 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->prev_leaf_backend_ids = tmp; sched->prev_leaf_backend_ids = tmp;
} }
// create copies of the graph for each split int graph_size = graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2;
// TODO: avoid this copy if (sched->graph.size < graph_size) {
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); sched->graph.size = graph_size;
sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct ggml_tensor *));
sched->graph.leafs = realloc(sched->graph.leafs, graph_size * sizeof(struct ggml_tensor *));
GGML_ASSERT(sched->graph.nodes != NULL);
GGML_ASSERT(sched->graph.leafs != NULL);
}
sched->graph.n_nodes = 0;
sched->graph.n_leafs = 0;
struct ggml_cgraph * graph_copy = &sched->graph;
for (int i = 0; i < sched->n_splits; i++) { for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i]; struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end); split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
@ -1654,12 +1661,12 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input = split->inputs[j];
const size_t input_id = hash_id(input); const size_t input_id = hash_id(input);
struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy]; struct ggml_tensor * input_cpy = tensor_id_copy(input_id, split->backend_id, sched->cur_copy);
// add a dependency to the input source so that it is not freed before the copy is done // add a dependency to the input source so that it is not freed before the copy is done
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
input_dep->src[0] = input; input_dep->src[0] = input;
sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id]; sched->node_backend_ids[graph_copy->n_nodes] = sched->hv_tensor_backend_ids[input_id];
graph_copy->nodes[graph_copy->n_nodes++] = input_dep; graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
// add a dependency to the input copy so that it is allocated at the start of the split // add a dependency to the input copy so that it is allocated at the start of the split
@ -1681,7 +1688,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
size_t id = hash_id(input); size_t id = hash_id(input);
int backend_id = tensor_backend_id(input); int backend_id = tensor_backend_id(input);
for (int c = 0; c < sched->n_copies; c++) { for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
} }
@ -1694,7 +1701,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input = split->inputs[j];
size_t id = hash_id(input); size_t id = hash_id(input);
for (int c = 0; c < sched->n_copies; c++) { for (int c = 0; c < sched->n_copies; c++) {
struct ggml_tensor * input_cpy = sched->tensor_copies[id][backend_id][c]; struct ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id; sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
graph_copy->leafs[graph_copy->n_leafs++] = input_cpy; graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
} }
@ -1708,13 +1715,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf); sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
graph_copy->leafs[graph_copy->n_leafs++] = leaf; graph_copy->leafs[graph_copy->n_leafs++] = leaf;
} }
sched->graph = graph_copy;
} }
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
bool backend_ids_changed = false; bool backend_ids_changed = false;
for (int i = 0; i < sched->graph->n_nodes; i++) { for (int i = 0; i < sched->graph.n_nodes; i++) {
if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] && if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i] &&
sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) { sched->bufts[sched->node_backend_ids[i]] != sched->bufts[sched->prev_node_backend_ids[i]]) {
backend_ids_changed = true; backend_ids_changed = true;
@ -1722,7 +1727,7 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
} }
} }
if (!backend_ids_changed) { if (!backend_ids_changed) {
for (int i = 0; i < sched->graph->n_leafs; i++) { for (int i = 0; i < sched->graph.n_leafs; i++) {
if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] && if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i] &&
sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) { sched->bufts[sched->leaf_backend_ids[i]] != sched->bufts[sched->prev_leaf_backend_ids[i]]) {
backend_ids_changed = true; backend_ids_changed = true;
@ -1732,14 +1737,14 @@ static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
} }
// allocate graph // allocate graph
if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
// the re-allocation may cause the split inputs to be moved to a different address // the re-allocation may cause the split inputs to be moved to a different address
ggml_backend_sched_synchronize(sched); ggml_backend_sched_synchronize(sched);
#ifndef NDEBUG #ifndef NDEBUG
fprintf(stderr, "%s: failed to allocate graph, reserving\n", __func__); fprintf(stderr, "%s: failed to allocate graph, reserving (backend_ids_changed = %d)\n", __func__, backend_ids_changed);
#endif #endif
ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids); ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids);
if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { if (!ggml_gallocr_alloc_graph(sched->galloc, &sched->graph)) {
fprintf(stderr, "%s: failed to allocate graph\n", __func__); fprintf(stderr, "%s: failed to allocate graph\n", __func__);
return false; return false;
} }
@ -1760,7 +1765,7 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
for (int j = 0; j < split->n_inputs; j++) { for (int j = 0; j < split->n_inputs; j++) {
ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]); ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[j]);
struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input = split->inputs[j];
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id][sched->cur_copy]; struct ggml_tensor * input_cpy = tensor_copy(input, split_backend_id, sched->cur_copy);
if (input->flags & GGML_TENSOR_FLAG_INPUT) { if (input->flags & GGML_TENSOR_FLAG_INPUT) {
// inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done
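// (clarifying note, not part of the original patch — the hunk is truncated at this point)
// In outline: a user-provided input is copied with a blocking ggml_backend_tensor_copy()
// after synchronizing the split backend, so the caller may overwrite its buffer right away;
// inputs produced by other splits can be transferred with ggml_backend_tensor_copy_async().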
@ -1846,21 +1851,23 @@ ggml_backend_sched_t ggml_backend_sched_new(
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
sched->debug = getenv("GGML_SCHED_DEBUG") != NULL; sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
// initialize hash table // initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size); // FIXME: needs to be size*2 to account for leafs (do it in graph_split instead)
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); sched->hash_set = ggml_hash_set_new(graph_size);
sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0])); sched->hv_tensor_backend_ids = malloc(sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
sched->hv_tensor_copies = malloc(sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0])); sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0])); sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
sched->n_backends = n_backends; sched->context_buffer_size = GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + ggml_graph_overhead_custom(graph_size, false);
sched->context_buffer = malloc(sched->context_buffer_size);
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
const int initial_splits_capacity = 16; const int initial_splits_capacity = 16;
sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0])); sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0]));
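The single flat hv_tensor_copies allocation above (hash_set.size * n_backends * n_copies pointers) replaces the old 3-D tensor_copies array, and the tensor_id_copy()/tensor_copy() accessors used throughout this file index into it. Their definitions are not part of this excerpt, so the following is only a sketch of the assumed row-major layout:

    // assumed accessors (not shown in this diff); hash_id() is the existing lookup helper
    #define tensor_id_copy(id, backend_id, copy) \
        sched->hv_tensor_copies[(id)*sched->n_backends*sched->n_copies + (backend_id)*sched->n_copies + (copy)]
    #define tensor_copy(tensor, backend_id, copy) \
        tensor_id_copy(hash_id(tensor), backend_id, copy)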
@ -1895,37 +1902,37 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
} }
ggml_gallocr_free(sched->galloc); ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx); ggml_free(sched->ctx);
ggml_hash_set_free(&sched->hash_set);
free(sched->splits); free(sched->splits);
free(sched->hash_set.keys); free(sched->hv_tensor_backend_ids);
free(sched->tensor_backend_id); free(sched->hv_tensor_copies);
free(sched->tensor_copies);
free(sched->node_backend_ids); free(sched->node_backend_ids);
free(sched->leaf_backend_ids); free(sched->leaf_backend_ids);
free(sched->prev_node_backend_ids); free(sched->prev_node_backend_ids);
free(sched->prev_leaf_backend_ids); free(sched->prev_leaf_backend_ids);
free(sched->context_buffer);
free(sched->graph.nodes);
free(sched->graph.leafs);
free(sched); free(sched);
} }
void ggml_backend_sched_reset(ggml_backend_sched_t sched) { void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
// reset state for the next run // reset state for the next run
if (!sched->is_reset) { if (!sched->is_reset) {
size_t hash_size = sched->hash_set.size; ggml_hash_set_reset(&sched->hash_set);
memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT memset(sched->hv_tensor_backend_ids, -1, sched->hash_set.size * sizeof(sched->hv_tensor_backend_ids[0]));
memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); memset(sched->hv_tensor_copies, 0, sched->hash_set.size * sched->n_backends * sched->n_copies * sizeof(struct ggml_tensor *));
memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
sched->is_reset = true; sched->is_reset = true;
} }
sched->is_alloc = false; sched->is_alloc = false;
} }
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes); GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes + measure_graph->n_leafs);
ggml_backend_sched_split_graph(sched, measure_graph); ggml_backend_sched_split_graph(sched, measure_graph);
// TODO: extract this to a separate function if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
return false; return false;
} }
@ -1936,10 +1943,11 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
} }
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes); GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
ggml_backend_sched_split_graph(sched, graph); ggml_backend_sched_split_graph(sched, graph);
if (!ggml_backend_sched_alloc_splits(sched)) { if (!ggml_backend_sched_alloc_splits(sched)) {
return false; return false;
} }
@ -2009,6 +2017,7 @@ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct gg
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index; tensor_backend_id(node) = backend_index;
SET_CAUSE(node, "usr"); SET_CAUSE(node, "usr");
sched->is_reset = false;
} }
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
@ -2051,9 +2060,9 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
GGML_ASSERT(src != NULL); GGML_ASSERT(src != NULL);
GGML_ASSERT(src->data && "graph must be allocated"); GGML_ASSERT(src->data && "graph must be allocated");
size_t id = ggml_hash_insert(hash_set, src); size_t id = ggml_hash_insert(&hash_set, src);
if (id == GGML_HASHTABLE_ALREADY_EXISTS) { if (id == GGML_HASHSET_ALREADY_EXISTS) {
return node_copies[ggml_hash_find(hash_set, src)]; return node_copies[ggml_hash_find(&hash_set, src)];
} }
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src); struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
@ -2078,7 +2087,7 @@ static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set,
return dst; return dst;
} }
static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { static void graph_copy_init_tensor(struct ggml_hash_set * hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
size_t id = ggml_hash_find(hash_set, src); size_t id = ggml_hash_find(hash_set, src);
if (node_init[id]) { if (node_init[id]) {
return; return;
@ -2105,10 +2114,7 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te
} }
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
struct ggml_hash_set hash_set = { struct ggml_hash_set hash_set = ggml_hash_set_new(graph->visited_hash_set.size);
/* .size = */ graph->visited_hash_table.size,
/* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT
};
struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT
bool * node_init = calloc(hash_set.size, sizeof(node_init[0])); bool * node_init = calloc(hash_set.size, sizeof(node_init[0]));
@ -2123,7 +2129,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
if (ctx_allocated == NULL || ctx_unallocated == NULL) { if (ctx_allocated == NULL || ctx_unallocated == NULL) {
fprintf(stderr, "failed to allocate context for graph copy\n"); fprintf(stderr, "failed to allocate context for graph copy\n");
free(hash_set.keys); ggml_hash_set_free(&hash_set);
free(node_copies); free(node_copies);
free(node_init); free(node_init);
ggml_free(ctx_allocated); ggml_free(ctx_allocated);
@ -2146,7 +2152,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
if (buffer == NULL) { if (buffer == NULL) {
fprintf(stderr, "failed to allocate buffer for graph copy\n"); fprintf(stderr, "failed to allocate buffer for graph copy\n");
free(hash_set.keys); ggml_hash_set_free(&hash_set);
free(node_copies); free(node_copies);
free(node_init); free(node_init);
ggml_free(ctx_allocated); ggml_free(ctx_allocated);
@ -2164,19 +2170,19 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s
// copy data and init views // copy data and init views
for (int i = 0; i < graph->n_nodes; i++) { for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node = graph->nodes[i];
graph_copy_init_tensor(hash_set, node_copies, node_init, node); graph_copy_init_tensor(&hash_set, node_copies, node_init, node);
} }
// build graph copy // build graph copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false); struct ggml_cgraph * graph_copy = ggml_new_graph_custom(ctx_allocated, graph->size, false);
for (int i = 0; i < graph->n_nodes; i++) { for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i]; struct ggml_tensor * node = graph->nodes[i];
struct ggml_tensor * node_copy = node_copies[ggml_hash_find(hash_set, node)]; struct ggml_tensor * node_copy = node_copies[ggml_hash_find(&hash_set, node)];
graph_copy->nodes[i] = node_copy; graph_copy->nodes[i] = node_copy;
} }
graph_copy->n_nodes = graph->n_nodes; graph_copy->n_nodes = graph->n_nodes;
free(hash_set.keys); ggml_hash_set_free(&hash_set);
free(node_copies); free(node_copies);
free(node_init); free(node_init);

View file

@ -275,8 +275,7 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
break; break;
default: default:
fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node)); GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
GGML_ASSERT(false);
} }
} }
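The recurring replacement of GGML_ASSERT(false) with GGML_ABORT(...) in this and the following hunks also carries a printf-style message. The macro itself is not part of this diff; a minimal sketch of the assumed contract, for orientation only:

    // hypothetical sketch — the real definition lives in the ggml headers
    #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
    // assumed to print file:line plus the formatted message, then terminate the process
    void ggml_abort(const char * file, int line, const char * fmt, ...);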

View file

@ -120,7 +120,7 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
file, line); file, line);
GGML_CANN_LOG_ERROR(" %s\n", stmt); GGML_CANN_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace // abort with GGML_ASSERT to get a stack trace
GGML_ASSERT(!"CANN error"); GGML_ABORT("CANN error");
} }
/** /**
@ -342,7 +342,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
// memory should always buffered. these memory may still needed by // memory should always buffered. these memory may still needed by
// tasks in stream. // tasks in stream.
// TODO, fix me. // TODO, fix me.
GGML_ASSERT(!"Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n");
} }
}; };
@ -1559,23 +1559,18 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
return false; return false;
} }
// need open both directions for memcpyasync between devices.
ggml_cann_set_device(cann_ctx_dst->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0));
ggml_cann_set_device(cann_ctx_src->device); ggml_cann_set_device(cann_ctx_src->device);
ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0)); ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0));
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
ACL_MEMCPY_DEVICE_TO_DEVICE, ACL_MEMCPY_DEVICE_TO_DEVICE,
cann_ctx_dst->stream())); cann_ctx_src->stream()));
// record event on src stream //TODO: workaround for Event didn`t work here.
if (!cann_ctx_src->copy_event) { aclrtSynchronizeStream(cann_ctx_src->stream());
ACL_CHECK(aclrtCreateEvent(&cann_ctx_src->copy_event));
}
ACL_CHECK(
aclrtRecordEvent(cann_ctx_src->copy_event, cann_ctx_src->stream()));
// wait on dst stream for the copy to complete
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx_dst->stream(),
cann_ctx_src->copy_event));
} else { } else {
// src and dst are on the same backend // src and dst are on the same backend
ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size,
@ -1763,8 +1758,8 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
* *
* This function determines whether the CANN backend supports the given backend * This function determines whether the CANN backend supports the given backend
* buffer type by comparing the device context of the backend and buffer type. * buffer type by comparing the device context of the backend and buffer type.
* It returns true if the device associated with the buffer type matches the * It returns true if the devices are same between the backend context and
* device associated with the backend. * buffer type context.
* *
* @param backend Pointer to the CANN backend. * @param backend Pointer to the CANN backend.
* @param buft Pointer to the backend buffer type to check. * @param buft Pointer to the backend buffer type to check.
@ -1773,9 +1768,14 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
*/ */
GGML_CALL static bool ggml_backend_cann_supports_buft( GGML_CALL static bool ggml_backend_cann_supports_buft(
ggml_backend_t backend, ggml_backend_buffer_type_t buft) { ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name; if (ggml_backend_buft_is_cann(buft)) {
ggml_backend_cann_context * cann_ctx =
GGML_UNUSED(backend); (ggml_backend_cann_context *)backend->context;
ggml_backend_cann_buffer_type_context * buft_ctx =
(ggml_backend_cann_buffer_type_context *)buft->context;
return buft_ctx->device == cann_ctx->device;
}
return false;
} }
/** /**
@ -1874,7 +1874,7 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
(aclrtEvent)event->context)); (aclrtEvent)event->context));
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }

View file

@ -844,7 +844,7 @@ void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_max_pool2d(ctx, dst); ggml_cann_max_pool2d(ctx, dst);
break; break;
case GGML_OP_POOL_COUNT: case GGML_OP_POOL_COUNT:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }
@ -931,9 +931,9 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb); ((ggml_tensor*)dst->extra)->nb);
return; return;
} }
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if (dst->type == GGML_TYPE_F32) { if (dst->type == GGML_TYPE_F32) {
if (ggml_are_same_shape(src, dst)) { if (ggml_are_same_shape(src, dst)) {
@ -955,12 +955,12 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb); ((ggml_tensor*)dst->extra)->nb);
return; return;
} }
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
// TODO // TODO
GGML_ASSERT(false); GGML_ABORT("fatal error");
} else if (src->type == GGML_TYPE_F32) { } else if (src->type == GGML_TYPE_F32) {
// TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
// && nb0 == type_size) // && nb0 == type_size)
@ -991,10 +991,10 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb); ((ggml_tensor*)dst->extra)->nb);
return; return;
} }
GGML_ASSERT(false); GGML_ABORT("fatal error");
} else { } else {
// TODO: dst not contiguous // TODO: dst not contiguous
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
if (dst->type == GGML_TYPE_F16) { if (dst->type == GGML_TYPE_F16) {
@ -1017,11 +1017,11 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb); ((ggml_tensor*)dst->extra)->nb);
return; return;
} }
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
// TODO // TODO
GGML_ASSERT(false); GGML_ABORT("fatal error");
} else { } else {
if (ggml_are_same_shape(src, dst)) { if (ggml_are_same_shape(src, dst)) {
cann_copy(ctx, acl_src, acl_dst); cann_copy(ctx, acl_src, acl_dst);
@ -1029,7 +1029,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyTensor(acl_dst)); ACL_CHECK(aclDestroyTensor(acl_dst));
return; return;
} }
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -2219,7 +2219,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)dst->extra)->nb); ((ggml_tensor*)dst->extra)->nb);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }
@ -2492,7 +2492,7 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_mul_mat_q8_0(ctx, dst); ggml_cann_mul_mat_q8_0(ctx, dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }

View file

@ -19,7 +19,11 @@ typedef half2 ggml_half2;
#define GGML_COMMON_DECL #define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA) #elif defined(GGML_COMMON_DECL_CUDA)
#if defined(GGML_COMMON_DECL_MUSA)
#include <musa_fp16.h>
#else
#include <cuda_fp16.h> #include <cuda_fp16.h>
#endif
#include <cstdint> #include <cstdint>
typedef half ggml_half; typedef half ggml_half;
@ -415,7 +419,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
#define GGML_TABLE_END() }; #define GGML_TABLE_END() };
#define GGML_COMMON_IMPL #define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) #elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
#include <cstdint> #include <cstdint>
#define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = { #define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {

View file

@ -98,7 +98,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line); GGML_CUDA_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
GGML_CUDA_LOG_ERROR(" %s\n", stmt); GGML_CUDA_LOG_ERROR(" %s\n", stmt);
// abort with GGML_ASSERT to get a stack trace // abort with GGML_ASSERT to get a stack trace
GGML_ASSERT(!"CUDA error"); GGML_ABORT("CUDA error");
} }
// this is faster on Windows // this is faster on Windows
@ -167,7 +167,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) { for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0; int device_vmm = 0;
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
CUdevice device; CUdevice device;
CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@ -179,7 +179,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
alloc_prop.location.id = id; alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
} }
#endif // !defined(GGML_USE_HIPBLAS) #endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
info.devices[id].vmm = !!device_vmm; info.devices[id].vmm = !!device_vmm;
cudaDeviceProp prop; cudaDeviceProp prop;
@ -315,7 +315,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
}; };
// pool with virtual memory // pool with virtual memory
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool { struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@ -409,14 +409,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used)); GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
} }
}; };
#endif // !defined(GGML_USE_HIPBLAS) #endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) { std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
if (ggml_cuda_info().devices[device].vmm) { if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device)); return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
} }
#endif #endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) && !defined(GGML_USE_MUSA)
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device)); return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
} }
@ -1341,7 +1341,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
static cudaError_t ggml_cuda_Memcpy2DPeerAsync( static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) { void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
#if !defined(GGML_USE_HIPBLAS) #if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {}; cudaMemcpy3DPeerParms p = {};
p.dstDevice = dstDevice; p.dstDevice = dstDevice;
@ -1355,7 +1355,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
GGML_UNUSED(dstDevice); GGML_UNUSED(dstDevice);
GGML_UNUSED(srcDevice); GGML_UNUSED(srcDevice);
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream); return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
#endif // !defined(GGML_USE_HIPBLAS) #endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
} }
static void ggml_cuda_op_mul_mat( static void ggml_cuda_op_mul_mat(
@ -1596,7 +1596,7 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_cpy_tensor_2d( CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if (quantize_src1 && !src1_is_contiguous) { if (quantize_src1 && !src1_is_contiguous) {
@ -1828,6 +1828,9 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
} }
} }
#else #else
#ifdef GGML_USE_MUSA
GGML_ASSERT(false);
#else // !GGML_USE_MUSA
if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) { if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
// there is no broadcast and src0, src1 are contiguous across dims 2, 3 // there is no broadcast and src0, src1 are contiguous across dims 2, 3
// use cublasGemmStridedBatchedEx // use cublasGemmStridedBatchedEx
@ -1870,6 +1873,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
cu_compute_type, cu_compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} }
#endif // GGML_USE_MUSA
#endif #endif
if (dst->op_params[0] == GGML_PREC_DEFAULT) { if (dst->op_params[0] == GGML_PREC_DEFAULT) {
@ -2945,7 +2949,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event));
#endif #endif
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -3027,7 +3031,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size
return false; return false;
} }
#if CUDART_VERSION >= 11100 #if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) { if (err != cudaSuccess) {
// clear the error // clear the error

View file

@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
} else if (order == GGML_SORT_ORDER_DESC) { } else if (order == GGML_SORT_ORDER_DESC) {
k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad); k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }

View file

@ -259,7 +259,7 @@ static void ggml_cuda_op_bin_bcast(
} else { } else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }

View file

@ -12,6 +12,10 @@
#else #else
#define GGML_COMMON_DECL_CUDA #define GGML_COMMON_DECL_CUDA
#define GGML_COMMON_IMPL_CUDA #define GGML_COMMON_IMPL_CUDA
#if defined(GGML_USE_MUSA)
#define GGML_COMMON_DECL_MUSA
#define GGML_COMMON_IMPL_MUSA
#endif
#endif #endif
#include "ggml-common.h" #include "ggml-common.h"
@ -114,6 +118,150 @@
#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED #define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR #define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED #define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
#elif defined(GGML_USE_MUSA)
#include <musa_runtime.h>
#include <musa.h>
#include <mublas.h>
#include <musa_fp16.h>
// XXX: Keep the following order the same as hipBLAS
// #define CUBLAS_COMPUTE_16F MUBLAS_COMPUTE_16F
// #define CUBLAS_COMPUTE_32F MUBLAS_COMPUTE_32F
#define CUBLAS_COMPUTE_32F_FAST_16F MUBLAS_COMPUTE_32F_FAST_16F
#define CUBLAS_GEMM_DEFAULT MUBLAS_GEMM_DEFAULT
#define CUBLAS_GEMM_DEFAULT_TENSOR_OP MUBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N MUBLAS_OP_N
#define CUBLAS_OP_T MUBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
// #define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F MUSA_R_16F
#define CUDA_R_32F MUSA_R_32F
// #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
// #define cublasComputeType_t mublasComputeType_t
#define cublasCreate mublasCreate
#define cublasDestroy mublasDestroy
#define cublasGemmEx mublasGemmEx
#define cublasGemmBatchedEx mublasGemmBatchedEx
#define cublasGemmStridedBatchedEx mublasGemmStridedBatchedEx
#define cublasHandle_t mublasHandle_t
// #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetMathMode mublasSetMathMode
#define cublasSetStream mublasSetStream
#define cublasSgemm mublasSgemm
#define cublasStatus_t mublasStatus_t
#define cudaDataType_t musaDataType_t //deprecated, new hipblasDatatype not in 5.6
#define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess musaDeviceEnablePeerAccess
#define cudaDeviceProp musaDeviceProp
#define cudaDeviceSynchronize musaDeviceSynchronize
#define cudaError_t musaError_t
#define cudaErrorPeerAccessAlreadyEnabled musaErrorPeerAccessAlreadyEnabled
#define cudaErrorPeerAccessNotEnabled musaErrorPeerAccessNotEnabled
#define cudaEventCreateWithFlags musaEventCreateWithFlags
#define cudaEventDisableTiming musaEventDisableTiming
#define cudaEventRecord musaEventRecord
#define cudaEventSynchronize musaEventSynchronize
#define cudaEvent_t musaEvent_t
#define cudaEventDestroy musaEventDestroy
#define cudaFree musaFree
#define cudaFreeHost musaFreeHost
#define cudaGetDevice musaGetDevice
#define cudaGetDeviceCount musaGetDeviceCount
#define cudaGetDeviceProperties musaGetDeviceProperties
#define cudaGetErrorString musaGetErrorString
#define cudaGetLastError musaGetLastError
#define cudaHostRegister musaHostRegister
#define cudaHostRegisterPortable musaHostRegisterPortable
#define cudaHostRegisterReadOnly musaHostRegisterReadOnly
#define cudaHostUnregister musaHostUnregister
#define cudaLaunchHostFunc musaLaunchHostFunc
#define cudaMalloc musaMalloc
#define cudaMallocHost musaMallocHost
#define cudaMemcpy musaMemcpy
#define cudaMemcpyAsync musaMemcpyAsync
#define cudaMemcpyPeerAsync musaMemcpyPeerAsync
#define cudaMemcpy2DAsync musaMemcpy2DAsync
#define cudaMemcpyDeviceToDevice musaMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost musaMemcpyDeviceToHost
#define cudaMemcpyHostToDevice musaMemcpyHostToDevice
#define cudaMemcpyKind musaMemcpyKind
#define cudaMemset musaMemset
#define cudaMemsetAsync musaMemsetAsync
#define cudaMemGetInfo musaMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize musaOccupancyMaxPotentialBlockSize
#define cudaSetDevice musaSetDevice
#define cudaStreamCreateWithFlags musaStreamCreateWithFlags
#define cudaStreamDestroy musaStreamDestroy
#define cudaStreamFireAndForget musaStreamFireAndForget
#define cudaStreamNonBlocking musaStreamNonBlocking
#define cudaStreamPerThread musaStreamPerThread
#define cudaStreamSynchronize musaStreamSynchronize
#define cudaStreamWaitEvent musaStreamWaitEvent
#define cudaStream_t musaStream_t
#define cudaSuccess musaSuccess
// XXX: Other CUDA => MUSA mapping
#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE MU_MEM_ACCESS_FLAGS_PROT_READWRITE
#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED MU_MEM_ALLOC_GRANULARITY_RECOMMENDED
#define CU_MEM_ALLOCATION_TYPE_PINNED MU_MEM_ALLOCATION_TYPE_PINNED
#define CU_MEM_LOCATION_TYPE_DEVICE MU_MEM_LOCATION_TYPE_DEVICE
#define CUdevice MUdevice
#define CUdeviceptr MUdeviceptr
#define CUmemAccessDesc MUmemAccessDesc
#define CUmemAllocationProp MUmemAllocationProp
#define CUmemGenericAllocationHandle MUmemGenericAllocationHandle
#define cuDeviceGet muDeviceGet
#define cuDeviceGetAttribute muDeviceGetAttribute
#define cuMemAddressFree muMemAddressFree
#define cuMemAddressReserve muMemAddressReserve
#define cuMemCreate muMemCreate
#define cuMemGetAllocationGranularity muMemGetAllocationGranularity
#define cuMemMap muMemMap
#define cuMemRelease muMemRelease
#define cuMemSetAccess muMemSetAccess
#define cuMemUnmap muMemUnmap
#define cudaFuncAttributeMaxDynamicSharedMemorySize musaFuncAttributeMaxDynamicSharedMemorySize
#define cudaFuncSetAttribute musaFuncSetAttribute
#define cudaMemcpy3DPeerParms musaMemcpy3DPeerParms
#define make_cudaExtent make_musaExtent
#define make_cudaPitchedPtr make_musaPitchedPtr
// XXX: USE_CUDA_GRAPH
#define CUDA_SUCCESS MUSA_SUCCESS
#define CUresult MUresult
#define cuGetErrorString muGetErrorString
#define cudaErrorGraphExecUpdateFailure musaErrorGraphExecUpdateFailure
#define cudaErrorInvalidDeviceFunction musaErrorInvalidDeviceFunction
#define cudaGraphDestroy musaGraphDestroy
#define cudaGraphExecDestroy musaGraphExecDestroy
#define cudaGraphExec_t musaGraphExec_t
#define cudaGraphExecUpdate musaGraphExecUpdate
#define cudaGraphExecUpdateResultInfo musaGraphExecUpdateResult
#define cudaGraphGetNodes musaGraphGetNodes
#define cudaGraphInstantiate musaGraphInstantiate
#define cudaGraphKernelNodeGetParams musaGraphKernelNodeGetParams
#define cudaGraphKernelNodeSetParams musaGraphKernelNodeSetParams
#define cudaGraphLaunch musaGraphLaunch
#define cudaGraphNodeGetType musaGraphNodeGetType
#define cudaGraphNode_t musaGraphNode_t
#define cudaGraphNodeType musaGraphNodeType
#define cudaGraphNodeTypeKernel musaGraphNodeTypeKernel
#define cudaGraph_t musaGraph_t
#define cudaKernelNodeParams musaKernelNodeParams
#define cudaStreamCaptureModeRelaxed musaStreamCaptureModeRelaxed
#define cudaStreamEndCapture musaStreamEndCapture
// XXX: cuBLAS => muBLAS mapping
#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED MU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
#define CUBLAS_COMPUTE_16F CUDA_R_16F
#define CUBLAS_COMPUTE_32F CUDA_R_32F
#define cublasComputeType_t cudaDataType_t
// XXX: Clang builtins mapping
#define __vsub4 __vsub4_musa
#define __vcmpeq4 __vcmpeq4_musa
#define __vcmpne4 __vcmpne4_musa
#else #else
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cuda.h> #include <cuda.h>
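The net effect of the CUDA-to-MUSA #define table above is that the host-side backend code compiles unchanged against either runtime. An illustrative snippet (not taken from the diff) using only names that appear in the mapping:

    // under GGML_USE_MUSA every cuda* symbol below resolves to its musa* counterpart
    cudaStream_t stream;
    void * buf = NULL;
    CUDA_CHECK(cudaSetDevice(0));
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUDA_CHECK(cudaMalloc(&buf, 1 << 20));
    CUDA_CHECK(cudaMemsetAsync(buf, 0, 1 << 20, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));
    CUDA_CHECK(cudaFree(buf));
    CUDA_CHECK(cudaStreamDestroy(stream));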
@ -168,9 +316,13 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) #define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString)
#if CUDART_VERSION >= 12000 #if CUDART_VERSION >= 12000 || defined(GGML_USE_MUSA)
static const char * cublas_get_error_str(const cublasStatus_t err) { static const char * cublas_get_error_str(const cublasStatus_t err) {
#ifndef GGML_USE_MUSA
return cublasGetStatusString(err); return cublasGetStatusString(err);
#else
return mublasStatus_to_string(err);
#endif // GGML_USE_MUSA
} }
#else #else
static const char * cublas_get_error_str(const cublasStatus_t err) { static const char * cublas_get_error_str(const cublasStatus_t err) {
@ -200,7 +352,7 @@ static const char * cu_get_error_str(CUresult err) {
#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) #define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str)
#endif #endif
#if CUDART_VERSION >= 11100 #if CUDART_VERSION >= 11100 || defined(GGML_USE_MUSA)
#define GGML_CUDA_ASSUME(x) __builtin_assume(x) #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
#else #else
#define GGML_CUDA_ASSUME(x) #define GGML_CUDA_ASSUME(x)
@ -214,6 +366,42 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2; typedef float2 dfloat2;
#endif //GGML_CUDA_F16 #endif //GGML_CUDA_F16
#if defined(GGML_USE_MUSA)
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsub4_musa(const int a, const int b) {
return __vsubss4(a, b);
}
static __device__ __forceinline__ unsigned int __vcmpeq4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0xff : 0x00;
}
return c;
}
static __device__ __forceinline__ unsigned int __vcmpne4_musa(unsigned int a, unsigned int b) {
const uint8x4_t& va = reinterpret_cast<const uint8x4_t&>(a);
const uint8x4_t& vb = reinterpret_cast<const uint8x4_t&>(b);
unsigned int c;
uint8x4_t& vc = reinterpret_cast<uint8x4_t&>(c);
#pragma unroll
for (int i = 0; i < 4; ++i) {
vc[i] = va[i] == vb[i] ? 0x00 : 0xff;
}
return c;
}
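// illustrative semantics (comment added for clarity, not part of the original patch):
//   __vcmpeq4_musa(0x11223344, 0x11AA3344) == 0xFF00FFFF   // only byte 2 differs
//   __vcmpne4_musa(0x11223344, 0x11AA3344) == 0x00FF0000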
#endif // defined(GGML_USE_MUSA)
#if defined(GGML_USE_HIPBLAS) #if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300 #define __CUDA_ARCH__ 1300
@ -348,7 +536,7 @@ static __device__ void no_device_code(
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__)) #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else #else
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.") #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__ #endif // __CUDA_ARCH__
static __device__ __forceinline__ float warp_reduce_sum(float x) { static __device__ __forceinline__ float warp_reduce_sum(float x) {
@ -455,7 +643,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b))); const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
return mask_low | mask_high; return mask_low | mask_high;
} }
#endif // CUDART_VERSION < 12000 #endif // CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) { static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)

View file

@ -451,7 +451,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
} else { } else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type)); ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -484,6 +484,6 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
} else { } else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type)); ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }

View file

@ -662,7 +662,7 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }

View file

@ -564,7 +564,7 @@ static void on_no_fattn_vec_case(const int D) {
fprintf(stderr, "Unsupported KV type combination for head_size 64.\n"); fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
fprintf(stderr, "By default only f16 KV cache is supported.\n"); fprintf(stderr, "By default only f16 KV cache is supported.\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n"); fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
GGML_ASSERT(false); GGML_ABORT("fatal error");
} else if (D == 128) { } else if (D == 128) {
fprintf(stderr, "Unsupported KV type combination for head_size 128.\n"); fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
fprintf(stderr, "Supported combinations:\n"); fprintf(stderr, "Supported combinations:\n");
@ -572,11 +572,11 @@ static void on_no_fattn_vec_case(const int D) {
fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n"); fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n");
fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n"); fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n"); fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
GGML_ASSERT(false); GGML_ABORT("fatal error");
} else { } else {
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n"); fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");
fprintf(stderr, "Only f16 is supported.\n"); fprintf(stderr, "Only f16 is supported.\n");
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }

View file

@ -287,7 +287,7 @@ void launch_fattn_tile_f16_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break; } break;
default: { default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break; } break;
} }
} }

View file

@ -284,7 +284,7 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true); launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
} break; } break;
default: { default: {
GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128."); GGML_ABORT("FlashAttention without tensor cores only supports head sizes 64 and 128.");
} break; } break;
} }
} }

View file

@ -38,7 +38,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst); ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, float>(ctx, dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} else { } else {
@ -63,7 +63,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
// ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst); // ggml_cuda_flash_attn_ext_wmma_f16_case<128, cols_per_block, float>(ctx, dst);
// break; // break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }
@ -86,7 +86,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
return; return;
@ -114,7 +114,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
return; return;
@ -141,7 +141,7 @@ static void ggml_cuda_flash_attn_ext_wmma_f16(ggml_backend_cuda_context & ctx, g
ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst); ggml_cuda_flash_attn_ext_wmma_f16_case<256, cols_per_block, half>(ctx, dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }

View file

@ -171,8 +171,7 @@ void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
break; break;
default: default:
// TODO: k-quants // TODO: k-quants
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); GGML_ABORT("%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
GGML_ASSERT(false);
break; break;
} }
} }

View file

@ -84,7 +84,7 @@ void ggml_cuda_op_mul_mat_q(
mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream); mul_mat_q_case<GGML_TYPE_IQ4_NL>(ctx, args, stream);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }

View file

@ -75,7 +75,7 @@ static mmq_q8_1_ds_layout mmq_get_q8_1_ds_layout(const ggml_type type_x) {
case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_NL:
return MMQ_Q8_1_DS_LAYOUT_D4; return MMQ_Q8_1_DS_LAYOUT_D4;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }
@ -2898,7 +2898,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
break; break;
default: default:
fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best); fprintf(stderr, "mmq_x_best=%d\n", mmq_x_best);
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }

View file

@ -162,7 +162,7 @@ static void mul_mat_vec_q_cuda(
rows_per_cuda_block = 2; rows_per_cuda_block = 2;
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }
@ -196,7 +196,7 @@ static void mul_mat_vec_q_cuda(
mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); mul_mat_vec_q<type, 8><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }
@ -413,7 +413,7 @@ void ggml_cuda_op_mul_mat_vec_q(
mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }

View file

@ -163,7 +163,7 @@ void quantize_mmq_q8_1_cuda(
<<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded); <<<num_blocks, block_size, 0, stream>>>(x, vy, kx0, kx1, kx0_padded);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }

View file

@ -251,7 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
attn_factor, corr_dims, freq_factors, stream attn_factor, corr_dims, freq_factors, stream
); );
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} else { } else {
if (src0->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32) {
@ -265,7 +265,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
attn_factor, corr_dims, freq_factors, stream attn_factor, corr_dims, freq_factors, stream
); );
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
} }

View file

@ -634,21 +634,121 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif #endif
#define GGML_HASHTABLE_FULL ((size_t)-1) // bitset
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
#define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
#define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
static size_t ggml_bitset_size(size_t n) {
return (n + BITSET_MASK) >> BITSET_SHR;
}
static inline bool ggml_bitset_get(const ggml_bitset_t * bitset, size_t i) {
return !!(bitset[i >> BITSET_SHR] & (1u << (i & BITSET_MASK)));
}
static inline void ggml_bitset_set(ggml_bitset_t * bitset, size_t i) {
bitset[i >> BITSET_SHR] |= (1u << (i & BITSET_MASK));
}
static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
bitset[i >> BITSET_SHR] &= ~(1u << (i & BITSET_MASK));
}
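// worked example (comment added for clarity, not part of the original patch):
//   ggml_bitset_size(100) == 4 words of 32 bits (room for 128 flags)
//   bit 37 lives in word 37 >> BITSET_SHR == 1, at position 37 & BITSET_MASK == 5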
// hash set
#define GGML_HASHSET_FULL ((size_t)-1)
#define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set ggml_hash_set_new(size_t size); struct ggml_hash_set ggml_hash_set_new(size_t size);
void ggml_hash_set_free(struct ggml_hash_set * hash_set);
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key); // returns the minimum size for a hash set that can hold min_sz elements
size_t ggml_hash_size(size_t min_sz);
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted // remove all elements from the hash set
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key); void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full // returns true if key is in the hash set
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key); static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// return index, asserts if table is full // return index, asserts if table is full
size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key); static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
// hash function for ggml_tensor
static inline size_t ggml_hash(const struct ggml_tensor * p) {
// the last 4 bits are always zero due to alignment
return (size_t)(uintptr_t)p >> 4;
}
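// e.g. (comment added for clarity): a 16-byte-aligned tensor at address 0x7f1234567890 hashes to
// 0x7f1234567890 >> 4 == 0x7f123456789 — the always-zero alignment bits are discarded before the
// modulo so that keys spread across buckets instead of clustering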
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;
// linear probing
size_t i = h;
while (ggml_bitset_get(hash_set->used, i) && hash_set->keys[i] != key) {
i = (i + 1) % hash_set->size;
if (i == h) {
// visited all hash table entries -> not found
return GGML_HASHSET_FULL;
}
}
return i;
}
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t i = ggml_hash_find(hash_set, key);
return i != GGML_HASHSET_FULL && ggml_bitset_get(hash_set->used, i);
}
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;
// linear probing
size_t i = h;
do {
if (!ggml_bitset_get(hash_set->used, i)) {
ggml_bitset_set(hash_set->used, i);
hash_set->keys[i] = key;
return i;
}
if (hash_set->keys[i] == key) {
return GGML_HASHSET_ALREADY_EXISTS;
}
i = (i + 1) % hash_set->size;
} while (i != h);
// visited all hash table entries -> not found
GGML_ABORT("fatal error");
}
static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
size_t h = ggml_hash(key) % hash_set->size;
// linear probing
size_t i = h;
do {
if (!ggml_bitset_get(hash_set->used, i)) {
ggml_bitset_set(hash_set->used, i);
hash_set->keys[i] = key;
return i;
}
if (hash_set->keys[i] == key) {
return i;
}
i = (i + 1) % hash_set->size;
} while (i != h);
// visited all hash table entries -> not found
GGML_ABORT("fatal error");
}
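Taken together, the new hash-set API is used as follows (a minimal usage sketch under the declarations above; graph is assumed to be a populated struct ggml_cgraph *):

    struct ggml_hash_set visited = ggml_hash_set_new(graph->n_nodes + graph->n_leafs);
    for (int i = 0; i < graph->n_nodes; i++) {
        if (ggml_hash_insert(&visited, graph->nodes[i]) == GGML_HASHSET_ALREADY_EXISTS) {
            continue; // node was already recorded
        }
        // ... per-node work ...
    }
    ggml_hash_set_reset(&visited); // clear for reuse without reallocating
    ggml_hash_set_free(&visited);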
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -566,7 +566,7 @@ uint32_t safe_divide(uint32_t a, uint32_t b) {
} }
if ((a % b) != 0) { if ((a % b) != 0) {
fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b); fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b);
GGML_ASSERT(!"safe_divide result would've had remainder"); GGML_ABORT("safe_divide result would've had remainder");
} }
return a / b; return a / b;
} }
@ -1460,7 +1460,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
if (!ggml_vk_supports_op(dst)) { if (!ggml_vk_supports_op(dst)) {
fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); fprintf(stderr, "%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
GGML_ASSERT(!"unsupported op"); GGML_ABORT("unsupported op");
} }
const int32_t ne00 = src0 ? src0->ne[0] : 0; const int32_t ne00 = src0 ? src0->ne[0] : 0;
@ -1562,7 +1562,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
default: default:
{ {
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
} break; } break;
@ -1745,7 +1745,7 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
continue; continue;
not_implemented: {} not_implemented: {}
fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
//GGML_ASSERT(false); //GGML_ABORT("fatal error");
} }
// Evaluate sequence // Evaluate sequence
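Throughout this commit, unreachable or unsupported-op paths that previously used `GGML_ASSERT(false)` or `GGML_ASSERT(!"msg")` now call `GGML_ABORT("msg")`. Presumably the point is that an explicit abort carries a printable message and can be marked noreturn, which states the intent more clearly than asserting a constant false. Below is a minimal sketch of such a macro; `MY_ABORT` and `pick_pipeline` are made-up names illustrating the pattern, not the definition added to ggml.

```cpp
#include <cstdio>
#include <cstdlib>

// illustrative abort-with-message helper in the spirit of the GGML_ABORT pattern
[[noreturn]] static void my_abort_impl(const char * file, int line, const char * msg) {
    std::fprintf(stderr, "%s:%d: %s\n", file, line, msg);
    std::abort();
}

#define MY_ABORT(msg) my_abort_impl(__FILE__, __LINE__, msg)

static int pick_pipeline(int op) {
    switch (op) {
        case 0: return 100;
        case 1: return 101;
        default: MY_ABORT("unsupported op"); // message + abort instead of asserting a constant false
    }
}

int main() {
    std::printf("%d\n", pick_pipeline(1));
    // pick_pipeline(42); // would print "...: unsupported op" and abort
    return 0;
}
```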

View file

@ -869,7 +869,7 @@ static enum ggml_status ggml_metal_graph_compute(
NSError * error = nil; NSError * error = nil;
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) { if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]); GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
GGML_ASSERT(!"capture failed"); GGML_ABORT("capture failed");
} }
} }
@ -931,7 +931,7 @@ static enum ggml_status ggml_metal_graph_compute(
if (!ggml_metal_supports_op(ctx, dst)) { if (!ggml_metal_supports_op(ctx, dst)) {
GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst)); GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
GGML_ASSERT(!"unsupported op"); GGML_ABORT("unsupported op");
} }
if (should_capture) { if (should_capture) {
@ -1068,7 +1068,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break; case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
default: GGML_ASSERT(false); default: GGML_ABORT("fatal error");
} }
bcast_row = true; bcast_row = true;
@ -1077,7 +1077,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break; case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break; case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break; case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
default: GGML_ASSERT(false); default: GGML_ABORT("fatal error");
} }
} }
@ -1131,7 +1131,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_F16].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I32].pipeline; break;
case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break; case GGML_TYPE_I16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_REPEAT_I16].pipeline; break;
default: GGML_ASSERT(false); default: GGML_ABORT("fatal error");
} }
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
@ -1387,7 +1387,7 @@ static enum ggml_status ggml_metal_graph_compute(
default: default:
{ {
GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} break; } break;
case GGML_OP_SQR: case GGML_OP_SQR:
@ -1609,7 +1609,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break;
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); default: GGML_ABORT("MUL MAT-MAT not implemented");
} }
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
@ -1782,7 +1782,7 @@ static enum ggml_status ggml_metal_graph_compute(
default: default:
{ {
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
GGML_ASSERT(false && "not implemented"); GGML_ABORT("not implemented");
} }
}; };
@ -1911,7 +1911,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break;
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); default: GGML_ABORT("MUL_MAT_ID not implemented");
} }
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
@ -2078,7 +2078,7 @@ static enum ggml_status ggml_metal_graph_compute(
default: default:
{ {
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
GGML_ASSERT(false && "not implemented"); GGML_ABORT("not implemented");
} }
}; };
@ -2178,7 +2178,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break;
case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break;
case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break;
default: GGML_ASSERT(false && "not implemented"); default: GGML_ABORT("not implemented");
} }
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
@ -2316,13 +2316,13 @@ static enum ggml_status ggml_metal_graph_compute(
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break; case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NORM_F16].pipeline; break;
default: GGML_ASSERT(false); default: GGML_ABORT("fatal error");
}; };
} else { } else {
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break; case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_NEOX_F16].pipeline; break;
default: GGML_ASSERT(false); default: GGML_ABORT("fatal error");
}; };
} }
@ -2399,7 +2399,7 @@ static enum ggml_status ggml_metal_graph_compute(
switch (dst->type) { switch (dst->type) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break; case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
default: GGML_ASSERT(false); default: GGML_ABORT("fatal error");
}; };
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
@ -2556,7 +2556,7 @@ static enum ggml_status ggml_metal_graph_compute(
switch (order) { switch (order) {
case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break; case GGML_SORT_ORDER_ASC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline; break;
case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break; case GGML_SORT_ORDER_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
default: GGML_ASSERT(false); default: GGML_ABORT("fatal error");
}; };
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
@ -2645,7 +2645,7 @@ static enum ggml_status ggml_metal_graph_compute(
{ {
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n"); GGML_METAL_LOG_ERROR("add template specialization for this size\n");
GGML_ASSERT(false && "add template specialization for this size"); GGML_ABORT("add template specialization for this size");
} }
} }
} else { } else {
@ -2658,7 +2658,7 @@ static enum ggml_status ggml_metal_graph_compute(
{ {
GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00);
GGML_METAL_LOG_ERROR("add template specialization for this size\n"); GGML_METAL_LOG_ERROR("add template specialization for this size\n");
GGML_ASSERT(false && "add template specialization for this size"); GGML_ABORT("add template specialization for this size");
} }
} }
} }
@ -2779,7 +2779,7 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break; case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break; case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL].pipeline; break;
default: GGML_ASSERT(false && "not implemented"); default: GGML_ABORT("not implemented");
}; };
} break; } break;
case GGML_TYPE_F16: case GGML_TYPE_F16:
@ -2787,10 +2787,10 @@ static enum ggml_status ggml_metal_graph_compute(
switch (dstt) { switch (dstt) {
case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break; case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
default: GGML_ASSERT(false && "not implemented"); default: GGML_ABORT("not implemented");
}; };
} break; } break;
default: GGML_ASSERT(false && "not implemented"); default: GGML_ABORT("not implemented");
} }
[encoder setComputePipelineState:pipeline]; [encoder setComputePipelineState:pipeline];
@ -2818,7 +2818,7 @@ static enum ggml_status ggml_metal_graph_compute(
default: default:
{ {
GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }

View file

@ -4190,15 +4190,18 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3); sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
#endif #endif
for (; ib < nb; ++ib) { for (; ib < nb; ++ib) {
int sumi = 0; int sumi0 = 0;
int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) { for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F) - 8; const int v0 = (x[ib].qs[j] & 0x0F) - 8;
const int v1 = (x[ib].qs[j] >> 4) - 8; const int v1 = (x[ib].qs[j] >> 4) - 8;
sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); sumi0 += (v0 * y[ib].qs[j]);
sumi1 += (v1 * y[ib].qs[j + qk/2]);
} }
int sumi = sumi0 + sumi1;
sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d); sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
} }
@ -4474,15 +4477,18 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
sumf = hsum_float_8(acc) + summs; sumf = hsum_float_8(acc) + summs;
#endif #endif
for (; ib < nb; ++ib) { for (; ib < nb; ++ib) {
int sumi = 0; int sumi0 = 0;
int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) { for (int j = 0; j < qk/2; ++j) {
const int v0 = (x[ib].qs[j] & 0x0F); const int v0 = (x[ib].qs[j] & 0x0F);
const int v1 = (x[ib].qs[j] >> 4); const int v1 = (x[ib].qs[j] >> 4);
sumi += (v0 * y[ib].qs[j]) + (v1 * y[ib].qs[j + qk/2]); sumi0 += (v0 * y[ib].qs[j]);
sumi1 += (v1 * y[ib].qs[j + qk/2]);
} }
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
} }
@ -4823,18 +4829,21 @@ void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, size_t bs, const void * r
uint32_t qh; uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh)); memcpy(&qh, x[ib].qh, sizeof(qh));
int sumi = 0; int sumi0 = 0;
int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) { for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4; const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12)); const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
const int32_t x0 = ((x[ib].qs[j] & 0x0F) | xh_0) - 16; const int32_t x0 = (int8_t)(((x[ib].qs[j] & 0x0F) | xh_0) - 16);
const int32_t x1 = ((x[ib].qs[j] >> 4) | xh_1) - 16; const int32_t x1 = (int8_t)(((x[ib].qs[j] >> 4) | xh_1) - 16);
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); sumi0 += (x0 * y[ib].qs[j]);
sumi1 += (x1 * y[ib].qs[j + qk/2]);
} }
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi; sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d)) * sumi;
} }
@ -5194,7 +5203,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
uint32_t qh; uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh)); memcpy(&qh, x[ib].qh, sizeof(qh));
int sumi = 0; int sumi0 = 0;
int sumi1 = 0;
for (int j = 0; j < qk/2; ++j) { for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10; const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
@ -5203,9 +5213,11 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0; const int32_t x0 = (x[ib].qs[j] & 0xF) | xh_0;
const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1; const int32_t x1 = (x[ib].qs[j] >> 4) | xh_1;
sumi += (x0 * y[ib].qs[j]) + (x1 * y[ib].qs[j + qk/2]); sumi0 += (x0 * y[ib].qs[j]);
sumi1 += (x1 * y[ib].qs[j + qk/2]);
} }
int sumi = sumi0 + sumi1;
sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s); sumf += (GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d))*sumi + GGML_FP16_TO_FP32(x[ib].m)*GGML_FP16_TO_FP32(y[ib].s);
} }
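In the scalar fallback loops above, the single `int sumi` accumulator is split into `sumi0`/`sumi1`, one per nibble half of the block, and for Q5_0 the unpacked values are explicitly narrowed through `int8_t` before widening. A reasonable reading is that this keeps the per-half accumulation and intermediate ranges aligned with the packed low/high-nibble layout before the two halves are combined once. The toy C++ sketch below shows the same split-accumulator shape on a made-up 4-bit block; `toy_q4_block` and `QK` are illustrative, not the real `block_q4_0`/`block_q8_0` definitions.

```cpp
#include <cstdint>
#include <cstdio>

constexpr int QK = 32; // toy block size: 32 weights packed as 16 bytes of 4-bit values

// toy 4-bit block: one scale; low nibbles hold the first half of the weights, high nibbles the second half
struct toy_q4_block {
    float   d;
    uint8_t qs[QK/2];
};

// dot product of one toy block against 32 int8 activations with activation scale dy
static float toy_dot(const toy_q4_block & x, const int8_t * y, float dy) {
    int sumi0 = 0; // accumulates the low-nibble half
    int sumi1 = 0; // accumulates the high-nibble half
    for (int j = 0; j < QK/2; ++j) {
        const int v0 = (x.qs[j] & 0x0F) - 8; // low nibble, centered on zero
        const int v1 = (x.qs[j] >>   4) - 8; // high nibble, centered on zero
        sumi0 += v0 * y[j];
        sumi1 += v1 * y[j + QK/2];
    }
    const int sumi = sumi0 + sumi1; // the two halves are combined once, as in the patched scalar loops
    return sumi * x.d * dy;
}

int main() {
    toy_q4_block x = { 0.5f, {} };
    int8_t y[QK];
    for (int j = 0; j < QK/2; ++j) x.qs[j] = (uint8_t)((j % 16) | ((15 - j % 16) << 4));
    for (int j = 0; j < QK;   ++j) y[j] = (int8_t)(j - 16);
    std::printf("dot = %f\n", toy_dot(x, y, 0.25f));
    return 0;
}
```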
@ -12692,7 +12704,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
printf("Oops: found point %u not on grid:", u); printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n"); printf("\n");
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
q2[2*ib+0] |= ((uint32_t) grid_index << 8*k); q2[2*ib+0] |= ((uint32_t) grid_index << 8*k);
q2[2*ib+1] |= (block_signs[k] << 7*k); q2[2*ib+1] |= (block_signs[k] << 7*k);
@ -12871,7 +12883,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
printf("Oops: found point %u not on grid:", u); printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n"); printf("\n");
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
q2[2*ib+k] = grid_index | (block_signs[k] << 9); q2[2*ib+k] = grid_index | (block_signs[k] << 9);
} }
@ -13314,7 +13326,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
printf("Oops: found point %u not on grid:", u); printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
printf("\n"); printf("\n");
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if (grid_size == 256) { if (grid_size == 256) {
q3[8*ib+k] = grid_index; q3[8*ib+k] = grid_index;
@ -13527,7 +13539,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
printf("Oops: found point %u not on grid:", u); printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]); for (int i = 0; i < 4; ++i) printf(" %d", L[4*k+i]);
printf("\n"); printf("\n");
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
qs[k] = grid_index & 255; qs[k] = grid_index & 255;
qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8)); qh[(ib*bs4+k)/8] |= ((grid_index >> 8) << ((ib*bs4+k)%8));
@ -14503,7 +14515,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
printf("Oops: found point %u not on grid:", u); printf("Oops: found point %u not on grid:", u);
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]); for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
printf("\n"); printf("\n");
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int i8 = 2*ib + k; const int i8 = 2*ib + k;
y[ibl].qs[i8] = grid_index & 255; y[ibl].qs[i8] = grid_index & 255;
@ -14623,7 +14635,7 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
} }
if (nbytes % ggml_type_size(type) != 0) { if (nbytes % ggml_type_size(type) != 0) {
fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type); fprintf(stderr, "%s: invalid size %zu for type %s (type size = %zu)\n", __func__, nbytes, ggml_type_name(type), ggml_type_size(type));
return false; return false;
} }
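The `ggml_validate_row_data` change above only improves the diagnostic: instead of printing a numeric type id, the message now names the type and its block size when `nbytes` is not a multiple of it. A small sketch of that kind of check, with hypothetical names (`toy_type_name`, `toy_type_size`) standing in for `ggml_type_name()`/`ggml_type_size()`:

```cpp
#include <cstddef>
#include <cstdio>

// hypothetical type table; in ggml this information comes from ggml_type_name()/ggml_type_size()
static const char * toy_type_name[] = { "f32", "f16", "q4_0" };
static const size_t toy_type_size[] = { 4, 2, 18 }; // bytes per block; quantized blocks span many elements

static bool toy_validate(int type, size_t nbytes) {
    if (nbytes % toy_type_size[type] != 0) {
        std::fprintf(stderr, "invalid size %zu for type %s (type size = %zu)\n",
                     nbytes, toy_type_name[type], toy_type_size[type]);
        return false;
    }
    return true;
}

int main() {
    toy_validate(2, 36); // ok: two 18-byte blocks
    toy_validate(2, 40); // rejected, and the message names the type and its block size
    return 0;
}
```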

View file

@ -1723,7 +1723,7 @@ static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
}); });
}); });
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -2075,8 +2075,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
// GGML_SYCL_DEBUG("current device index %d\n", id); // GGML_SYCL_DEBUG("current device index %d\n", id);
src_ptr = (char *) extra->data_device[id]; src_ptr = (char *) extra->data_device[id];
} else { } else {
// GGML_SYCL_DEBUG("GGML_ASSERT(false)\n"); // GGML_SYCL_DEBUG("GGML_ABORT("fatal error")\n");
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
char * dst_ptr = (char *) dst; char * dst_ptr = (char *) dst;
@ -2163,7 +2163,7 @@ static void ggml_sycl_op_get_rows(ggml_backend_sycl_context & ctx, const ggml_te
default: default:
// TODO: k-quants // TODO: k-quants
fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }
@ -2192,7 +2192,7 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t
} else { } else {
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -2476,7 +2476,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
return 64; return 64;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -3101,7 +3101,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
SYCL_CHECK(ggml_sycl_cpy_tensor_2d( SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if (convert_src1_to_q8_1 && !src1_is_contiguous) { if (convert_src1_to_q8_1 && !src1_is_contiguous) {
@ -3896,7 +3896,7 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
} else { } else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type)); ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
(void) dst; (void) dst;

View file

@ -100,7 +100,7 @@ static void crash() {
const char* msg) { const char* msg) {
fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg); fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
fprintf(stderr, " in function %s at %s:%d\n", func, file, line); fprintf(stderr, " in function %s at %s:%d\n", func, file, line);
GGML_ASSERT(!"SYCL error"); GGML_ABORT("SYCL error");
} }
#define SYCL_CHECK(err) \ #define SYCL_CHECK(err) \

View file

@ -1011,7 +1011,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
break; break;
default: default:
printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type);
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }

View file

@ -975,7 +975,7 @@ namespace dpct
if (backend == "opencl:cpu") return 4; if (backend == "opencl:cpu") return 4;
if (backend == "opencl:acc") return 5; if (backend == "opencl:acc") return 5;
printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
static bool compare_backend(std::string &backend1, std::string &backend2) { static bool compare_backend(std::string &backend1, std::string &backend2) {
return convert_backend_index(backend1) < convert_backend_index(backend2); return convert_backend_index(backend1) < convert_backend_index(backend2);

View file

@ -1799,7 +1799,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q4_0_PASCAL; mmq_y = MMQ_Y_Q4_0_PASCAL;
nwarps = NWARPS_Q4_0_PASCAL; nwarps = NWARPS_Q4_0_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -1914,7 +1914,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q4_1_PASCAL; mmq_y = MMQ_Y_Q4_1_PASCAL;
nwarps = NWARPS_Q4_1_PASCAL; nwarps = NWARPS_Q4_1_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2029,7 +2029,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q5_0_PASCAL; mmq_y = MMQ_Y_Q5_0_PASCAL;
nwarps = NWARPS_Q5_0_PASCAL; nwarps = NWARPS_Q5_0_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2144,7 +2144,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q5_1_PASCAL; mmq_y = MMQ_Y_Q5_1_PASCAL;
nwarps = NWARPS_Q5_1_PASCAL; nwarps = NWARPS_Q5_1_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2259,7 +2259,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q8_0_PASCAL; mmq_y = MMQ_Y_Q8_0_PASCAL;
nwarps = NWARPS_Q8_0_PASCAL; nwarps = NWARPS_Q8_0_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2374,7 +2374,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q2_K_PASCAL; mmq_y = MMQ_Y_Q2_K_PASCAL;
nwarps = NWARPS_Q2_K_PASCAL; nwarps = NWARPS_Q2_K_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2497,7 +2497,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q3_K_PASCAL; mmq_y = MMQ_Y_Q3_K_PASCAL;
nwarps = NWARPS_Q3_K_PASCAL; nwarps = NWARPS_Q3_K_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2625,7 +2625,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q4_K_PASCAL; mmq_y = MMQ_Y_Q4_K_PASCAL;
nwarps = NWARPS_Q4_K_PASCAL; nwarps = NWARPS_Q4_K_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2746,7 +2746,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q5_K_PASCAL; mmq_y = MMQ_Y_Q5_K_PASCAL;
nwarps = NWARPS_Q5_K_PASCAL; nwarps = NWARPS_Q5_K_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -2867,7 +2867,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
mmq_y = MMQ_Y_Q6_K_PASCAL; mmq_y = MMQ_Y_Q6_K_PASCAL;
nwarps = NWARPS_Q6_K_PASCAL; nwarps = NWARPS_Q6_K_PASCAL;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
@ -3016,7 +3016,7 @@ void ggml_sycl_op_mul_mat_q(
ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }

View file

@ -1017,7 +1017,7 @@ void ggml_sycl_op_mul_mat_vec_q(
mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream);
break; break;
default: default:
GGML_ASSERT(false); GGML_ABORT("fatal error");
break; break;
} }
} }

View file

@ -251,7 +251,7 @@ void ggml_sycl_op_rope(
attn_factor, corr_dims, freq_factors, main_stream attn_factor, corr_dims, freq_factors, main_stream
); );
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} else { } else {
if (src0->type == GGML_TYPE_F32) { if (src0->type == GGML_TYPE_F32) {
@ -265,7 +265,7 @@ void ggml_sycl_op_rope(
attn_factor, corr_dims, freq_factors, main_stream attn_factor, corr_dims, freq_factors, main_stream
); );
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }

View file

@ -236,8 +236,8 @@ struct vk_device_struct {
}; };
struct vk_buffer_struct { struct vk_buffer_struct {
vk::Buffer buffer; vk::Buffer buffer = VK_NULL_HANDLE;
vk::DeviceMemory device_memory; vk::DeviceMemory device_memory = VK_NULL_HANDLE;
vk::MemoryPropertyFlags memory_property_flags; vk::MemoryPropertyFlags memory_property_flags;
void * ptr; void * ptr;
size_t size = 0; size_t size = 0;
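The `vk_buffer_struct` change above gives the raw Vulkan handles explicit `VK_NULL_HANDLE` defaults, so a buffer that was never fully created can be told apart from a live one. A hedged C++ sketch of that pattern, assuming the Vulkan-Hpp headers; `toy_buffer` and its `destroy()` are illustrative, not the actual ggml-vulkan code:

```cpp
#include <vulkan/vulkan.hpp>

// illustrative buffer wrapper: handles default to VK_NULL_HANDLE so cleanup is safe to run
// on an object that was only partially constructed
struct toy_buffer {
    vk::Buffer       buffer        = VK_NULL_HANDLE;
    vk::DeviceMemory device_memory = VK_NULL_HANDLE;
    size_t           size          = 0;

    void destroy(vk::Device device) {
        if (buffer) {                    // converts to false while still VK_NULL_HANDLE
            device.destroyBuffer(buffer);
            buffer = VK_NULL_HANDLE;
        }
        if (device_memory) {
            device.freeMemory(device_memory);
            device_memory = VK_NULL_HANDLE;
        }
        size = 0;
    }
};
```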
@ -1961,7 +1961,7 @@ void ggml_vk_instance_init() {
// Make sure at least one device exists // Make sure at least one device exists
if (devices.empty()) { if (devices.empty()) {
std::cerr << "ggml_vulkan: Error: No devices found." << std::endl; std::cerr << "ggml_vulkan: Error: No devices found." << std::endl;
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
// Default to using all dedicated GPUs // Default to using all dedicated GPUs
@ -2459,7 +2459,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
// Buffer is already mapped // Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl; std::cerr << "ggml_vulkan: buffer_write_nc_async dst buffer is host_visible. Use synchronous write." << std::endl;
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
// Check if src is pinned memory // Check if src is pinned memory
vk_buffer buf; vk_buffer buf;
@ -2527,7 +2527,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont
staging = ctx->device->sync_staging; staging = ctx->device->sync_staging;
staging_offset = 0; staging_offset = 0;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -2563,7 +2563,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
// Buffer is already mapped // Buffer is already mapped
if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl; std::cerr << "ggml_vulkan: buffer_write_async dst buffer is host_visible. Use synchronous write." << std::endl;
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
// Check if src is pinned memory // Check if src is pinned memory
vk_buffer buf = nullptr; vk_buffer buf = nullptr;
@ -2602,7 +2602,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context * subctx, vk_buffer& dst, s
staging_buffer = dst->device->sync_staging; staging_buffer = dst->device->sync_staging;
staging_offset = 0; staging_offset = 0;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -2704,7 +2704,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context * subctx, vk_buffer& src, si
staging_buffer = src->device->sync_staging; staging_buffer = src->device->sync_staging;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -2913,7 +2913,7 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_
} }
std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl; std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) { static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out) {
@ -3499,7 +3499,7 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context *
const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig;
if (mmp == nullptr) { if (mmp == nullptr) {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
// Not implemented // Not implemented
@ -4078,7 +4078,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
std::cerr << " and " << ggml_type_name(src1->type); std::cerr << " and " << ggml_type_name(src1->type);
} }
std::cerr << " to " << ggml_type_name(dst->type) << std::endl; std::cerr << " to " << ggml_type_name(dst->type) << std::endl;
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
op_func(ctx, subctx, src0, src1, dst); op_func(ctx, subctx, src0, src1, dst);
@ -4521,7 +4521,7 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0
} else if (type == GGML_TYPE_F16) { } else if (type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0)); val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0));
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
fprintf(stderr, "% 7.2f ", val); fprintf(stderr, "% 7.2f ", val);
} else { } else {
@ -4555,7 +4555,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_s; p = ctx->device->pipeline_matmul_f16->a_s;
shname = "F16_ALIGNED_S"; shname = "F16_ALIGNED_S";
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} else if (shader_size == 1) { } else if (shader_size == 1) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@ -4571,7 +4571,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_m; p = ctx->device->pipeline_matmul_f16->a_m;
shname = "F16_ALIGNED_M"; shname = "F16_ALIGNED_M";
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} else if (shader_size == 2) { } else if (shader_size == 2) {
if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) { if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
@ -4587,7 +4587,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
p = ctx->device->pipeline_matmul_f16->a_l; p = ctx->device->pipeline_matmul_f16->a_l;
shname = "F16_ALIGNED_L"; shname = "F16_ALIGNED_L";
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} else { } else {
GGML_ASSERT(0); GGML_ASSERT(0);
@ -4668,7 +4668,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) { } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); x[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
for (size_t i = 0; i < y_ne; i++) { for (size_t i = 0; i < y_ne; i++) {
@ -4679,7 +4679,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
// y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f); // y[i] = ggml_fp32_to_fp16((rand() / (float)RAND_MAX) * 2.0f - 1.0f);
y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f); y[i] = ggml_fp32_to_fp16((i % k == i / k) ? 1.0f : 0.0f);
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -4727,14 +4727,14 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
} else if (std::is_same<ggml_fp16_t, X_TYPE>()) { } else if (std::is_same<ggml_fp16_t, X_TYPE>()) {
src0_type = GGML_TYPE_F16; src0_type = GGML_TYPE_F16;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if (std::is_same<float, Y_TYPE>()) { if (std::is_same<float, Y_TYPE>()) {
src1_type = GGML_TYPE_F32; src1_type = GGML_TYPE_F32;
} else if (std::is_same<ggml_fp16_t, Y_TYPE>()) { } else if (std::is_same<ggml_fp16_t, Y_TYPE>()) {
src1_type = GGML_TYPE_F16; src1_type = GGML_TYPE_F16;
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch); ggml_tensor * src0_ggml = ggml_new_tensor_3d(ggml_ctx, src0_type, k, m, batch);
@ -4841,7 +4841,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1
} else if (tensor->type == GGML_TYPE_F16) { } else if (tensor->type == GGML_TYPE_F16) {
val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]));
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
fprintf(stderr, "% 7.2f ", val); fprintf(stderr, "% 7.2f ", val);
} else { } else {
@ -5391,7 +5391,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
std::cerr << std::endl; std::cerr << std::endl;
} }
GGML_ASSERT(false); GGML_ABORT("fatal error");
#endif #endif
if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) {
@ -5486,7 +5486,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
break; break;
default: default:
std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl;
GGML_ASSERT(false); GGML_ABORT("fatal error");
return; return;
} }
@ -6498,7 +6498,7 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d
} else if (tensor->type == GGML_TYPE_I32) { } else if (tensor->type == GGML_TYPE_I32) {
val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); val = *(const int32_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]);
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
fprintf(stderr, "% 7.2f ", val); fprintf(stderr, "% 7.2f ", val);
} else { } else {
@ -6620,7 +6620,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS);
} }
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@ -6662,7 +6662,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS);
} }
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@ -6720,7 +6720,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS); memcpy(src2_clone->nb, src2->nb, sizeof(size_t) * GGML_MAX_DIMS);
} }
} else { } else {
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if (vk_output_tensor > 0 && vk_output_tensor == check_counter) { if (vk_output_tensor > 0 && vk_output_tensor == check_counter) {
@ -6797,7 +6797,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
break; break;
default: default:
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) { } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
if (src1 == nullptr) { if (src1 == nullptr) {
@ -6825,7 +6825,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor *
tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone); tensor_clone = ggml_sum_rows(ggml_ctx, src0_clone);
} else { } else {
std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx); ggml_cgraph * cgraph = ggml_new_graph(ggml_ctx);
@ -6912,7 +6912,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
} }
} else { } else {
std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl; std::cerr << "Missing debug code for type " << ggml_type_name(tensor->type) << std::endl;
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) { if ((std::isnan(correct) != std::isnan(result)) || (std::isinf(correct) != std::isinf(result)) || !buffer_size_fit) {
@ -6935,7 +6935,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl; std::cerr << std::endl;
std::vector<const ggml_tensor *> done; std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done); ggml_vk_print_graph_origin(tensor, done);
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) { if (first_error[0] == -1 && std::fabs(correct - result) > 0.1f) {
first_error[0] = i0; first_error[0] = i0;
@ -7006,7 +7006,7 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor *
std::cerr << std::endl; std::cerr << std::endl;
std::vector<const ggml_tensor *> done; std::vector<const ggml_tensor *> done;
ggml_vk_print_graph_origin(tensor, done); ggml_vk_print_graph_origin(tensor, done);
GGML_ASSERT(false); GGML_ABORT("fatal error");
} else { } else {
std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl; std::cerr << check_counter << " " << tensor->name << " op=" << ggml_op_name(tensor->op) << " avg_err=" << avg_err << std::endl;
} }

File diff suppressed because it is too large

View file

@ -30,6 +30,20 @@
#define ASYNCIO_CONCURRENCY 64 #define ASYNCIO_CONCURRENCY 64
// define prototypes
void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str);
bool directory_exists(const std::string& path);
bool create_directory(const std::string& path);
std::string to_uppercase(const std::string& input);
bool string_ends_with(const std::string& str, const std::string& suffix);
std::string join_paths(const std::string& path1, const std::string& path2);
std::string basename(const std::string &path);
void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16);
std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b);
void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id);
void process_shaders(std::vector<std::future<void>>& tasks);
void write_output_files();
std::mutex lock; std::mutex lock;
std::vector<std::pair<std::string, std::string>> shader_fnames; std::vector<std::pair<std::string, std::string>> shader_fnames;
@ -38,7 +52,7 @@ std::string input_dir = "vulkan-shaders";
std::string output_dir = "/tmp"; std::string output_dir = "/tmp";
std::string target_hpp = "ggml-vulkan-shaders.hpp"; std::string target_hpp = "ggml-vulkan-shaders.hpp";
std::string target_cpp = "ggml-vulkan-shaders.cpp"; std::string target_cpp = "ggml-vulkan-shaders.cpp";
bool no_clean = false; bool clean = true;
const std::vector<std::string> type_names = { const std::vector<std::string> type_names = {
"f32", "f32",
@ -464,8 +478,9 @@ void write_output_files() {
} }
fprintf(src, "\n};\n\n"); fprintf(src, "\n};\n\n");
if (!no_clean) { if (clean) {
std::remove(path.c_str()); std::remove(path.c_str());
// fprintf(stderr, "Removed: %s\n", path.c_str());
} }
} }
@ -481,6 +496,18 @@ int main(int argc, char** argv) {
} }
} }
if (argc <= 1 || args.find("--help") != args.end()) {
std::cout << "Usage:\n"
"\tvulkan-shaders-gen [options]\n\n"
"Options:\n"
"\t--glslc <path> Path to glslc executable (default: /usr/bin/glslc)\n"
"\t--input-dir Directory containing shader sources (required)\n"
"\t--output-dir Output directory for generated SPIR-V files and optional C++ headers\n"
"\t--target-hpp <path> Path to generate a header file with shader declarations in C++ format\n"
"\t--target-cpp <path> Path to generate a source code file implementing the declared shaders (optional)\n"
"\t--no-clean Keep temporary SPIR-V files after build (default: remove them)\n";
return EXIT_SUCCESS;
}
if (args.find("--glslc") != args.end()) { if (args.find("--glslc") != args.end()) {
GLSLC = args["--glslc"]; // Path to glslc GLSLC = args["--glslc"]; // Path to glslc
} }
@ -497,7 +524,7 @@ int main(int argc, char** argv) {
target_cpp = args["--target-cpp"]; // Path to generated cpp file target_cpp = args["--target-cpp"]; // Path to generated cpp file
} }
if (args.find("--no-clean") != args.end()) { if (args.find("--no-clean") != args.end()) {
no_clean = true; // Keep temporary SPIR-V files in output-dir after build clean = false; // Keep temporary SPIR-V files in output-dir after build
} }
if (!directory_exists(input_dir)) { if (!directory_exists(input_dir)) {

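The shader generator above now declares its helpers up front, inverts the `no_clean` flag into a positive `clean` default, and prints a usage message when run without arguments or with `--help`. Its option handling is a simple map of `--flag value` pairs; a self-contained C++ sketch of that shape (the program name and option names below are only an example) could look like:

```cpp
#include <iostream>
#include <map>
#include <string>

int main(int argc, char ** argv) {
    std::map<std::string, std::string> args;
    for (int i = 1; i < argc; i += 2) {
        if (i + 1 < argc) {
            args[argv[i]] = argv[i + 1];   // "--flag value" pairs
        } else {
            args[argv[i]] = "";            // trailing flag with no value, e.g. --no-clean
        }
    }

    if (argc <= 1 || args.count("--help")) {
        std::cout << "Usage:\n\ttoy-gen [options]\n\n"
                     "Options:\n"
                     "\t--input-dir <path>   directory with inputs (required)\n"
                     "\t--no-clean           keep temporary files\n";
        return 0;
    }

    bool clean = true;                     // positive default, like the patched generator
    if (args.count("--no-clean")) {
        clean = false;
    }

    std::string input_dir = args.count("--input-dir") ? args["--input-dir"] : ".";
    std::cout << "input_dir=" << input_dir << " clean=" << std::boolalpha << clean << "\n";
    return 0;
}
```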
View file

@ -33,17 +33,15 @@
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
#define LLAMA_MAX_RNG_STATE (64*1024)
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq' #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 7 #define LLAMA_SESSION_VERSION 8
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
#define LLAMA_STATE_SEQ_VERSION 1 #define LLAMA_STATE_SEQ_VERSION 2
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -691,10 +689,11 @@ extern "C" {
// State / sessions // State / sessions
// //
// Returns the maximum size in bytes of the state (rng, logits, embedding // Returns the *actual* size in bytes of the state
// and kv_cache) - will often be smaller after compacting tokens // (rng, logits, embedding and kv_cache)
LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx); // Only use when saving the state, not when restoring it, otherwise the size may be too small.
LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx), LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
"use llama_state_get_size instead"); "use llama_state_get_size instead");
// Copies the state to the specified destination address. // Copies the state to the specified destination address.
@ -702,7 +701,8 @@ extern "C" {
// Returns the number of bytes copied // Returns the number of bytes copied
LLAMA_API size_t llama_state_get_data( LLAMA_API size_t llama_state_get_data(
struct llama_context * ctx, struct llama_context * ctx,
uint8_t * dst); uint8_t * dst,
size_t size);
LLAMA_API DEPRECATED(size_t llama_copy_state_data( LLAMA_API DEPRECATED(size_t llama_copy_state_data(
struct llama_context * ctx, struct llama_context * ctx,
uint8_t * dst), uint8_t * dst),
@ -712,7 +712,8 @@ extern "C" {
// Returns the number of bytes read // Returns the number of bytes read
LLAMA_API size_t llama_state_set_data( LLAMA_API size_t llama_state_set_data(
struct llama_context * ctx, struct llama_context * ctx,
const uint8_t * src); const uint8_t * src,
size_t size);
LLAMA_API DEPRECATED(size_t llama_set_state_data( LLAMA_API DEPRECATED(size_t llama_set_state_data(
struct llama_context * ctx, struct llama_context * ctx,
const uint8_t * src), const uint8_t * src),
@ -754,6 +755,7 @@ extern "C" {
LLAMA_API size_t llama_state_seq_get_data( LLAMA_API size_t llama_state_seq_get_data(
struct llama_context * ctx, struct llama_context * ctx,
uint8_t * dst, uint8_t * dst,
size_t size,
llama_seq_id seq_id); llama_seq_id seq_id);
// Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
@ -763,6 +765,7 @@ extern "C" {
LLAMA_API size_t llama_state_seq_set_data( LLAMA_API size_t llama_state_seq_set_data(
struct llama_context * ctx, struct llama_context * ctx,
const uint8_t * src, const uint8_t * src,
size_t size,
llama_seq_id dest_seq_id); llama_seq_id dest_seq_id);
LLAMA_API size_t llama_state_seq_save_file( LLAMA_API size_t llama_state_seq_save_file(

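The llama.h changes above bump LLAMA_SESSION_VERSION and LLAMA_STATE_SEQ_VERSION, drop LLAMA_MAX_RNG_STATE, and make the state getters/setters take an explicit destination/source size instead of trusting that the caller allocated enough. A hedged usage sketch against the new signatures shown in this diff (error handling trimmed; assumes an already created `llama_context * ctx`):

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"

// save and restore the full context state with the size-checked API from this diff
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> buf(llama_state_get_size(ctx));             // size queried at save time
    const size_t written = llama_state_get_data(ctx, buf.data(), buf.size());
    buf.resize(written);                                             // defensive: keep only what was written
    return buf;
}

static bool load_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    // passing buf.size() lets the library reject a buffer that is too small instead of overreading
    return llama_state_set_data(ctx, buf.data(), buf.size()) > 0;
}
```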
View file

@ -102,6 +102,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake # cmake/FindSIMD.cmake -> ggml/cmake/FindSIMD.cmake
# #
# src/ggml.c -> ggml/src/ggml.c # src/ggml.c -> ggml/src/ggml.c
# src/ggml-aarch64.c -> ggml/src/ggml-aarch64.c
# src/ggml-aarch64.h -> ggml/src/ggml-aarch64.h
# src/ggml-alloc.c -> ggml/src/ggml-alloc.c # src/ggml-alloc.c -> ggml/src/ggml-alloc.c
# src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h # src/ggml-backend-impl.h -> ggml/src/ggml-backend-impl.h
# src/ggml-backend.c -> ggml/src/ggml-backend.c # src/ggml-backend.c -> ggml/src/ggml-backend.c
@ -117,6 +119,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
# src/ggml-sycl/* -> ggml/src/ggml-sycl/ # src/ggml-sycl/* -> ggml/src/ggml-sycl/
# src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp # src/ggml-sycl.cpp -> ggml/src/ggml-sycl.cpp
# src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp # src/ggml-vulkan.cpp -> ggml/src/ggml-vulkan.cpp
# src/vulkan-shaders/* -> ggml/src/vulkan-shaders/
# #
# include/ggml.h -> ggml/include/ggml.h # include/ggml.h -> ggml/include/ggml.h
# include/ggml-alloc.h -> ggml/include/ggml-alloc.h # include/ggml-alloc.h -> ggml/include/ggml-alloc.h
@ -143,6 +146,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \ -e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
-e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \ -e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml\.c/\1ggml\/src\/ggml.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.c/\1ggml\/src\/ggml-aarch64.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-aarch64\.h/\1ggml\/src\/ggml-aarch64.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-alloc\.c/\1ggml\/src\/ggml-alloc.c/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend-impl\.h/\1ggml\/src\/ggml-backend-impl.h/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-backend\.c/\1ggml\/src\/ggml-backend.c/g' \
@ -158,6 +163,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\.cpp/\1ggml\/src\/ggml-sycl.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \ -e 's/([[:space:]]|[ab]\/)src\/ggml-vulkan\.cpp/\1ggml\/src\/ggml-vulkan.cpp/g' \
-e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml\.h/\1ggml\/include\/ggml.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-alloc\.h/\1ggml\/include\/ggml-alloc.h/g' \
-e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \ -e 's/([[:space:]]|[ab]\/)include\/ggml-backend\.h/\1ggml\/include\/ggml-backend.h/g' \

View file

@ -1 +1 @@
e3b3846976c94163f2b3dd128cc959782653edbb 31d544f87835a55602883fe09156bb85a4c163d8

View file

@ -5,6 +5,8 @@ cp -rpv ../ggml/src/CMakeLists.txt ./ggml/src/CMakeLists.txt
cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake cp -rpv ../ggml/cmake/FindSIMD.cmake ./ggml/cmake/FindSIMD.cmake
cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c cp -rpv ../ggml/src/ggml.c ./ggml/src/ggml.c
cp -rpv ../ggml/src/ggml-aarch64.c ./ggml/src/ggml-aarch64.c
cp -rpv ../ggml/src/ggml-aarch64.h ./ggml/src/ggml-aarch64.h
cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c cp -rpv ../ggml/src/ggml-alloc.c ./ggml/src/ggml-alloc.c
cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml/src/ggml-backend-impl.h
cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c cp -rpv ../ggml/src/ggml-backend.c ./ggml/src/ggml-backend.c
@ -21,6 +23,7 @@ cp -rpv ../ggml/src/ggml-rpc.cpp ./ggml/src/ggml-rpc.cpp
cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/ cp -rpv ../ggml/src/ggml-sycl/* ./ggml/src/ggml-sycl/
cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp cp -rpv ../ggml/src/ggml-sycl.cpp ./ggml/src/ggml-sycl.cpp
cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp cp -rpv ../ggml/src/ggml-vulkan.cpp ./ggml/src/ggml-vulkan.cpp
cp -rpv ../ggml/src/vulkan-shaders/* ./ggml/src/vulkan-shaders/
cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h cp -rpv ../ggml/include/ggml.h ./ggml/include/ggml.h
cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h cp -rpv ../ggml/include/ggml-alloc.h ./ggml/include/ggml-alloc.h

View file

@ -221,7 +221,7 @@ static void llama_grammar_advance_stack(
// end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
// (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on // (LLAMA_GRETYPE_CHAR_ALT, LLAMA_GRETYPE_CHAR_RNG_UPPER); stack should never be left on
// those // those
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
} }
@ -517,7 +517,7 @@ void llama_grammar_accept_token_impl(struct llama_grammar * grammar, const struc
return; return;
} }
} }
GGML_ASSERT(false); GGML_ABORT("fatal error");
} }
const std::string & piece = vocab->cache_token_to_piece.at(token); const std::string & piece = vocab->cache_token_to_piece.at(token);

View file

@@ -152,14 +152,14 @@ static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
return strtol(buf.c_str(), NULL, 16);
}
case LLAMA_VOCAB_TYPE_BPE: {
-GGML_ASSERT(false);
-return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
+GGML_ABORT("fatal error");
+//return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT?
}
case LLAMA_VOCAB_TYPE_WPM: {
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
}
default:
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
}
}
@@ -1396,7 +1396,7 @@ std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab,
}
} break;
case LLAMA_VOCAB_TYPE_NONE:
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
}
return output;
@@ -1422,7 +1422,7 @@ llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch) {
return vocab.token_to_id.at(unicode_byte_to_utf8(ch));
}
default:
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
}
}
@@ -1606,7 +1606,7 @@ int32_t llama_token_to_piece_impl(const struct llama_vocab & vocab, llama_token
break;
}
default:
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
}
}
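
The llama_token_to_byte hunk above keeps a byte fallback only for the vocab types that actually have one; the other branches now terminate through the abort macro. A simplified sketch of that shape follows (the enum, the to_byte signature and the "<0x41>" token format are illustrative stand-ins, not the llama.cpp API):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    enum class vocab_type { spm, bpe, wpm }; // illustrative subset

    // Assumes byte tokens look like "<0x41>"; not the real tokenizer format handling.
    static uint8_t to_byte(vocab_type type, const char * token_text) {
        switch (type) {
            case vocab_type::spm:
                // parse the two hex digits after "<0x"
                return static_cast<uint8_t>(strtol(token_text + 3, nullptr, 16));
            case vocab_type::bpe:   // no byte fallback for these vocabularies:
            case vocab_type::wpm:   // fall through to the fatal branch
            default:
                fprintf(stderr, "fatal error: no byte fallback for this vocab type\n");
                abort();            // plays the role of GGML_ABORT("fatal error")
        }
    }

    int main() {
        printf("%d\n", (int) to_byte(vocab_type::spm, "<0x41>")); // prints 65
    }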

File diff suppressed because it is too large.


@@ -94,7 +94,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
// This is going to create some weird integers though.
ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor));
} else {
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
}
}
@@ -132,7 +132,7 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
tt.to_float(&buf[i], vq.data(), bs);
tv.insert(tv.end(), vq.begin(), vq.end());
} else {
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
}
}
}
@@ -1435,7 +1435,7 @@ struct test_argsort : public test_case {
ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(float));
}
} else {
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
}
}
}
@@ -2462,7 +2462,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
return true;
}
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
return false;
}
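
In the last hunk above, the fatal branch is still followed by a return false; that can never execute. A hedged sketch of why such a trailing return is harmless (the fatal helper and run_mode below are illustrative; treating the abort macro as noreturn is an assumption here):

    #include <cstdio>
    #include <cstdlib>

    // Assumed to behave like a noreturn abort macro; illustrative only.
    [[noreturn]] static void fatal(const char * msg) {
        fprintf(stderr, "%s\n", msg);
        abort();
    }

    static bool run_mode(int mode) {
        if (mode == 0) {
            return true;      // e.g. the normal test mode
        }
        if (mode == 1) {
            return false;     // e.g. a mode that reports failure
        }
        fatal("fatal error"); // unknown mode: control never continues past this call
        return false;         // unreachable; kept only for compilers that do not
                              // propagate noreturn when checking return paths
    }

    int main() {
        printf("%d\n", run_mode(0) ? 1 : 0); // prints 1
    }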


@@ -166,12 +166,12 @@ static void test_sampler_queue(
for (auto s : samplers_sequence) {
switch (s){
case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
-case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
-case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
+case 'f': GGML_ABORT("tail_free test not implemented"); break;
+case 'y': GGML_ABORT("typical test not implemented"); break;
case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
-case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
-default : GGML_ASSERT(false && "Unknown sampler"); break;
+case 't': GGML_ABORT("temperature test not implemented"); break;
+default : GGML_ABORT("Unknown sampler"); break;
}
llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
@@ -222,7 +222,7 @@ static void test_sampler_queue(
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
} else {
-GGML_ASSERT(false);
+GGML_ABORT("fatal error");
}
}
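
The sampler-queue hunk above drives samplers from a character sequence and now aborts with a descriptive message for the letters whose tests are not implemented. A toy sketch of that dispatch shape follows (run_queue, abort_with and the toy top-k are illustrative, not the llama.cpp sampling API; only the letters and messages mirror the test):

    #include <algorithm>
    #include <cstdio>
    #include <cstdlib>
    #include <functional>
    #include <string>
    #include <vector>

    [[noreturn]] static void abort_with(const char * msg) {
        fprintf(stderr, "%s\n", msg);
        abort();
    }

    static void run_queue(const std::string & sequence, std::vector<float> & probs) {
        for (char s : sequence) {
            switch (s) {
                case 'k': // toy "top-k": keep the two largest probabilities
                    std::sort(probs.begin(), probs.end(), std::greater<float>());
                    if (probs.size() > 2) probs.resize(2);
                    break;
                case 'f': abort_with("tail_free test not implemented");   break;
                case 'y': abort_with("typical test not implemented");     break;
                case 't': abort_with("temperature test not implemented"); break;
                default : abort_with("Unknown sampler");                  break;
            }
        }
    }

    int main() {
        std::vector<float> probs = {0.1f, 0.5f, 0.2f, 0.2f};
        run_queue("k", probs);
        printf("kept %zu candidates\n", probs.size()); // prints: kept 2 candidates
    }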