Merge branch 'ggerganov:master' into master

Commit a51d1a416c by m3ndax, 2023-08-02 10:28:12 +02:00 (committed by GitHub)
20 changed files with 1720 additions and 1297 deletions


@@ -68,12 +68,12 @@ option(LLAMA_ACCELERATE "llama: enable Accelerate framework
 option(LLAMA_BLAS "llama: use BLAS" OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use CUDA" OFF)
-option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
+#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
-option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
+option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
 option(LLAMA_METAL "llama: use Metal" OFF)
@@ -253,9 +253,9 @@ if (LLAMA_CUBLAS)
     set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
     add_compile_definitions(GGML_USE_CUBLAS)
-    if (LLAMA_CUDA_CUBLAS)
-        add_compile_definitions(GGML_CUDA_CUBLAS)
-    endif()
+#    if (LLAMA_CUDA_CUBLAS)
+#        add_compile_definitions(GGML_CUDA_CUBLAS)
+#    endif()
     add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
@@ -265,8 +265,8 @@ if (LLAMA_CUBLAS)
     if (DEFINED LLAMA_CUDA_DMMV_Y)
         add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
     endif()
-    if (LLAMA_CUDA_DMMV_F16)
-        add_compile_definitions(GGML_CUDA_DMMV_F16)
+    if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+        add_compile_definitions(GGML_CUDA_F16)
     endif()
     add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -277,10 +277,14 @@ if (LLAMA_CUBLAS)
     endif()
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+        # 52 == lowest CUDA 12 standard
+        # 60 == f16 CUDA intrinsics
+        # 61 == integer CUDA intrinsics
+        # 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
         if (LLAMA_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
+            set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
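Editorial note: the architecture comments added above describe which hardware features each compute capability unlocks. A hypothetical C++ sketch (not taken from ggml-cuda.cu) of how such compute-capability gates usually look in CUDA sources, assuming the feature thresholds stated in those comments:

```cpp
// Hypothetical sketch: compile-time gating matching the comments above
// (60 -> f16 intrinsics, 61 -> integer dot-product intrinsics such as __dp4a).
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
#define HAS_INT8_DOT_PRODUCT 1 // __dp4a is available from compute capability 6.1
#else
#define HAS_INT8_DOT_PRODUCT 0
#endif

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
#define HAS_F16_INTRINSICS 1   // half-precision intrinsics path
#else
#define HAS_F16_INTRINSICS 0
#endif

// Host-side translation units see __CUDA_ARCH__ undefined, so both fall back to 0.
int describe_device_features() {
    return HAS_INT8_DOT_PRODUCT * 2 + HAS_F16_INTRINSICS;
}
```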


@@ -236,14 +236,14 @@ ifdef LLAMA_CUDA_MMQ_Y
 else
 	NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
 endif # LLAMA_CUDA_MMQ_Y
-ifdef LLAMA_CUDA_CUBLAS
-	NVCCFLAGS += -DGGML_CUDA_CUBLAS
-endif # LLAMA_CUDA_CUBLAS
+#ifdef LLAMA_CUDA_CUBLAS
+#	NVCCFLAGS += -DGGML_CUDA_CUBLAS
+#endif # LLAMA_CUDA_CUBLAS
 ifdef LLAMA_CUDA_CCBIN
 	NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+	$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
 ifdef LLAMA_CLBLAST
@@ -411,13 +411,13 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
-tests/test-double-float: tests/test-double-float.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
-tests/test-grad0: tests/test-grad0.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
-tests/test-opt: tests/test-opt.c build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS)


@@ -80,7 +80,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [x] LLaMA 2 🦙🦙
 - [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
-- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
@@ -88,6 +88,7 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
 - [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
 - [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivations (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
+- [X] [Aquila-7B](https://huggingface.co/BAAI/Aquila-7B) / [AquilaChat-7B](https://huggingface.co/BAAI/AquilaChat-7B)
 **Bindings:**
@@ -400,9 +401,11 @@ Building the program with BLAS support may lead to some performance improvements
 The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
+<!---
+| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
+--->
 | Option | Legal values | Default | Description |
 |-------------------------|------------------------|---------|-------------|
-| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
 | LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
 | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
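Editorial note: the options in the table above are passed to the compiler as ordinary preprocessor definitions. A minimal, hypothetical illustration (not the real kernel code in ggml-cuda.cu) of how a tuning constant such as `GGML_CUDA_MMQ_Y` is typically consumed:

```cpp
// Hypothetical sketch of consuming a build-time tuning constant; the real
// mul_mat_q kernels are considerably more involved.
#ifndef GGML_CUDA_MMQ_Y
#define GGML_CUDA_MMQ_Y 64 // default y tile size when the build does not override it
#endif

constexpr int mmq_tile_y = GGML_CUDA_MMQ_Y;

// the README above recommends a power of two >= 32 for this value
static_assert(mmq_tile_y >= 32 && (mmq_tile_y & (mmq_tile_y - 1)) == 0,
              "GGML_CUDA_MMQ_Y should be a power of two >= 32");
```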
@@ -490,6 +493,9 @@ Building the program with BLAS support may lead to some performance improvements
 # obtain the original LLaMA model weights and place them in ./models
 ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model
+# [Optional] for models using BPE tokenizers
+ls ./models
+65B 30B 13B 7B vocab.json
 # install Python dependencies
 python3 -m pip install -r requirements.txt
@@ -497,6 +503,9 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/
+# [Optional] for models using BPE tokenizers
+python convert.py models/7B/ --vocabtype bpe
 # quantize the model to 4-bits (using q4_0 method)
 ./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0


@@ -352,7 +352,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #ifdef GGML_USE_CUBLAS
         params.main_gpu = std::stoi(argv[i]);
 #else
         fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
 #endif
     } else if (arg == "--tensor-split" || arg == "-ts") {
         if (++i >= argc) {
@@ -376,13 +376,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             }
         }
 #else
         fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+    } else if (arg == "--mul-mat-q" || arg == "-mmq") {
+#ifdef GGML_USE_CUBLAS
+        params.mul_mat_q = true;
+#else
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
 #endif // GGML_USE_CUBLAS
     } else if (arg == "--low-vram" || arg == "-lv") {
 #ifdef GGML_USE_CUBLAS
         params.low_vram = true;
 #else
         fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
     } else if (arg == "--no-mmap") {
         params.use_mmap = false;
@@ -585,6 +591,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
     fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
+    fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
+    fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
+    fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
 #endif
     fprintf(stdout, " --mtest compute maximum memory usage\n");
     fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
@@ -637,6 +646,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     lparams.main_gpu = params.main_gpu;
     lparams.tensor_split = params.tensor_split;
     lparams.low_vram = params.low_vram;
+    lparams.mul_mat_q = params.mul_mat_q;
     lparams.seed = params.seed;
     lparams.f16_kv = params.memory_f16;
     lparams.use_mmap = params.use_mmap;


@@ -74,6 +74,7 @@ struct gpt_params {
     size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
     bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
+    bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
     bool memory_f16 = true; // use f16 instead of f32 for memory kv
     bool random_prompt = false; // do not randomize prompt if none provided
     bool use_color = false; // use color to distinguish generations and inputs


@@ -163,7 +163,7 @@ node .
     `content`: Set the text to tokenize.
-    Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
+    Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
 - **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
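Editorial note: as a usage illustration only, a `/tokenize` request could be issued from C++ with libcurl roughly as below. The endpoint name and the `content` field come from this README; the host, port and response shape are assumptions about a locally running server.

```cpp
// Hedged sketch: POST a tokenize request to a locally running llama.cpp server.
#include <curl/curl.h>
#include <iostream>
#include <string>

// libcurl write callback: append the response body into a std::string
static size_t collect(char * data, size_t size, size_t nmemb, void * userp) {
    static_cast<std::string *>(userp)->append(data, size * nmemb);
    return size * nmemb;
}

int main() {
    CURL * curl = curl_easy_init();
    if (!curl) return 1;

    const std::string body = R"({"content": "Hello, world!"})";
    std::string response;

    curl_slist * headers = curl_slist_append(nullptr, "Content-Type: application/json");
    curl_easy_setopt(curl, CURLOPT_URL, "http://127.0.0.1:8080/tokenize"); // assumed default address
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, collect);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);

    const CURLcode rc = curl_easy_perform(curl);
    if (rc == CURLE_OK) {
        std::cout << response << std::endl; // expected to contain the token ids
    }

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    return rc == CURLE_OK ? 0 : 1;
}
```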

File diff suppressed because it is too large.


@@ -3,12 +3,11 @@
 <head>
   <meta charset="UTF-8">
   <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
+  <meta name="color-scheme" content="light dark">
   <title>llama.cpp - chat</title>
   <style>
     body {
-      background-color: #fff;
-      color: #000;
       font-family: system-ui;
       font-size: 90%;
     }


@@ -631,6 +631,9 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
     fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
+    fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
+    fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
+    fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
 #endif
     fprintf(stdout, " -m FNAME, --model FNAME\n");
     fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
@@ -827,7 +830,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
         }
 #else
-        LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
+        LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
 #endif // GGML_USE_CUBLAS
     }
     else if (arg == "--low-vram" || arg == "-lv")
@@ -835,7 +838,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 #ifdef GGML_USE_CUBLAS
         params.low_vram = true;
 #else
-        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
+        LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
+#endif // GGML_USE_CUBLAS
+    }
+    else if (arg == "--mul-mat-q" || arg == "-mmq")
+    {
+#ifdef GGML_USE_CUBLAS
+        params.mul_mat_q = true;
+#else
+        LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
 #endif // GGML_USE_CUBLAS
     }
     else if (arg == "--main-gpu" || arg == "-mg")

File diff suppressed because it is too large.


@@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
+void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);


@@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
     // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
     GGML_ASSERT(ne00 == ne10);
-    GGML_ASSERT(ne02 == ne12);
+    // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
+    GGML_ASSERT(ne03 == ne13);
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) &&
@@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
     initWithDevice:ctx->device transposeLeft:false transposeRight:true
     resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
-    // we need to do ne02 multiplications
+    // we need to do ne12 multiplications
     // TODO: is there a way to do this in parallel - currently very slow ..
     // TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
-    for (int64_t i02 = 0; i02 < ne02; ++i02) {
-        size_t offs_src0_cur = offs_src0 + i02*nb02;
+    for (int64_t i02 = 0; i02 < ne12; ++i02) {
+        size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
         size_t offs_src1_cur = offs_src1 + i02*nb12;
         size_t offs_dst_cur = offs_dst + i02*nb2;
@@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
     switch (src0t) {
         case GGML_TYPE_F16:
             {
-                GGML_ASSERT(ne02 == ne12);
                 nth0 = 64;
                 nth1 = 1;
                 [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
@@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
     [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
     [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
     [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-    [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
-    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
-    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
-    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
-    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
-    [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
-    [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
-    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
-    [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
-    [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
+    [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+    [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+    [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+    [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+    [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+    [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+    [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+    [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+    [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+    [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+    [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
+    [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
     if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
         src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
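Editorial note: the key change above is the `i02/(ne12/ne02)` index. When src1 has more batch slices than src0 (ne12 > ne02, as in grouped-query-style broadcasting), each src0 slice is reused for a contiguous group of src1 slices. A small self-contained sketch of that mapping in plain C++ (illustrative values, not the Metal code):

```cpp
// Sketch of the broadcast index used above: src0 has ne02 slices, src1 has
// ne12 slices, and ne12 is assumed to be a multiple of ne02.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne02 = 2;  // assumed slice count of src0
    const int64_t ne12 = 8;  // assumed slice count of src1
    assert(ne12 % ne02 == 0);

    for (int64_t i02 = 0; i02 < ne12; ++i02) {
        const int64_t src0_slice = i02 / (ne12 / ne02); // src0 slice reused for this src1 slice
        std::printf("src1 slice %lld -> src0 slice %lld\n",
                    (long long) i02, (long long) src0_slice);
    }
    return 0;
}
```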


@@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
     device float * dst,
     constant int64_t & ne00,
     constant int64_t & ne01,
+    constant int64_t & ne02,
     constant uint64_t & nb00,
     constant uint64_t & nb01,
     constant uint64_t & nb02,
     constant int64_t & ne10,
     constant int64_t & ne11,
+    constant int64_t & ne12,
     constant uint64_t & nb10,
     constant uint64_t & nb11,
     constant uint64_t & nb12,
@@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
     const int64_t r1 = tgpig.y;
     const int64_t im = tgpig.z;
-    device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
+    device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
     device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
     sum[tpitg.x] = 0.0f;
@@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
     }
 }
 kernel void kernel_alibi_f32(
     device const float * src0,
     device float * dst,


@@ -901,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
     /*.progress_callback =*/ nullptr,
     /*.progress_callback_user_data =*/ nullptr,
     /*.low_vram =*/ false,
+    /*.mul_mat_q =*/ false,
     /*.f16_kv =*/ true,
     /*.logits_all =*/ false,
     /*.vocab_only =*/ false,
@@ -1028,6 +1029,7 @@ static void llama_model_load_internal(
     int n_gpu_layers,
     int main_gpu,
     const float * tensor_split,
+    const bool mul_mat_q,
     float rope_freq_base,
     float rope_freq_scale,
     bool low_vram,
@@ -1156,9 +1158,11 @@ static void llama_model_load_internal(
     }
     (void) main_gpu;
+    (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
     fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
+    ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
 #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
 #elif defined(GGML_USE_CLBLAST)
@@ -1367,6 +1371,7 @@ static bool llama_model_load(
     int n_gpu_layers,
     int main_gpu,
     const float * tensor_split,
+    const bool mul_mat_q,
     float rope_freq_base,
     float rope_freq_scale,
     bool low_vram,
@@ -1377,7 +1382,8 @@ static bool llama_model_load(
     llama_progress_callback progress_callback,
     void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
+        llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
+                main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
                 use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::exception & err) {
@@ -1812,6 +1818,12 @@ static bool llama_eval_internal(
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
     n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
+    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
 #if GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
@@ -1825,7 +1837,10 @@ static bool llama_eval_internal(
     //}
     ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
     ggml_metal_graph_compute(lctx.ctx_metal, gf);
-    ggml_metal_get_tensor (lctx.ctx_metal, cur);
+    ggml_metal_get_tensor (lctx.ctx_metal, res);
+    if (!lctx.embedding.empty()) {
+        ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
+    }
 } else {
     // IMPORTANT:
     // Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
@@ -1856,12 +1871,6 @@ static bool llama_eval_internal(
     // update kv token count
     lctx.kv_self.n = n_past + N;
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
-    LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
-    LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
     if (cgraph_fname) {
         ggml_graph_export(gf, cgraph_fname);
     }
@@ -3189,7 +3198,7 @@ struct llama_model * llama_load_model_from_file(
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
     if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
-                params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
+                params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
                 memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
                 params.progress_callback_user_data)) {
         delete model;


@ -108,6 +108,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool low_vram; // if true, reduce VRAM usage at the cost of performance bool low_vram; // if true, reduce VRAM usage at the cost of performance
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
bool f16_kv; // use fp16 for KV cache bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights bool vocab_only; // only load the vocabulary, no weights
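Editorial note: for code that embeds llama.cpp, the new flag is set like any other field of `llama_context_params`. A minimal, hedged sketch using API names visible in this diff and in the public llama.h of the time; the model path and layer count are placeholders:

```cpp
// Hedged sketch: enable the experimental mul_mat_q kernels through the C API.
// Only mul_mat_q is introduced by this commit; the other values are placeholders.
#include "llama.h"

int main() {
    llama_context_params params = llama_context_default_params();
    params.mul_mat_q    = true; // opt in to the experimental CUDA mul_mat_q kernels
    params.n_gpu_layers = 32;   // placeholder offload count

    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-q4_0.bin", params);
    if (model == nullptr) {
        return 1;
    }
    llama_context * ctx = llama_new_context_with_model(model, params);
    // ... evaluate tokens here ...
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```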


@@ -10,5 +10,5 @@ cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
 cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
-cp -rpv ../ggml/tests/test-opt.c ./tests/test-opt.c
-cp -rpv ../ggml/tests/test-grad0.c ./tests/test-grad0.c
+cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp
+cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp


@@ -6,10 +6,10 @@ function(llama_add_test source)
     add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()
-# llama_add_test(test-double-float.c) # SLOW
+# llama_add_test(test-double-float.cpp) # SLOW
 llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
-llama_add_test(test-grad0.c) # SLOW
-# llama_add_test(test-opt.c) # SLOW
+llama_add_test(test-grad0.cpp) # SLOW
+# llama_add_test(test-opt.cpp) # SLOW


@@ -3,10 +3,11 @@
 // This is done by checking all finite (non-NaN, non-infinite) floats.
 #undef NDEBUG
-#include <assert.h>
+#include <cassert>
 #include <immintrin.h>
-#include <math.h>
-#include <stdint.h>
+#include <cmath>
+#include <cstdint>
+#include <cstring>
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdouble-promotion"
@@ -32,8 +33,9 @@ inline static float silu_float(float x) {
 int main(void) {
     uint32_t x = UINT32_MAX;
     do {
-        float f = *(float *)&x;
-        assert(!isfinite(f) || (round_orig(f) == round_float(f)));
+        float f;
+        memcpy(&f, &x, sizeof(x));
+        assert(!std::isfinite(f) || (round_orig(f) == round_float(f)));
     } while (x--);
 #ifdef __F16C__
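Editorial note: the change above replaces pointer type-punning (`*(float *)&x`), which violates strict aliasing in C++, with a `memcpy` bit copy. The same pattern in isolation:

```cpp
// Minimal illustration of the memcpy "bit cast" used above: reinterpret the
// bits of a uint32_t as a float without violating strict aliasing.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    const uint32_t bits = 0x3f800000u; // IEEE-754 bit pattern for 1.0f
    float f;
    std::memcpy(&f, &bits, sizeof(f)); // well-defined, unlike *(float *)&bits
    std::printf("%f finite=%d\n", (double) f, std::isfinite(f) ? 1 : 0);
    return 0;
}
```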


@@ -1,10 +1,10 @@
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 #include "ggml.h"
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -47,16 +47,16 @@
 #define GGML_PRINT(...) printf(__VA_ARGS__)
-float frand(void) {
+static float frand(void) {
     return (float)rand()/(float)RAND_MAX;
 }
-int irand(int n) {
+static int irand(int n) {
     if (n == 0) return 0;
     return rand()%n;
 }
-void get_random_dims(int64_t * dims, int ndims) {
+static void get_random_dims(int64_t * dims, int ndims) {
     dims[0] = dims[1] = dims[2] = dims[3] = 1;
     for (int i = 0; i < ndims; i++) {
@@ -64,7 +64,7 @@ void get_random_dims(int64_t * dims, int ndims) {
     }
 }
-struct ggml_tensor * get_random_tensor_f32(
+static struct ggml_tensor * get_random_tensor_f32(
     struct ggml_context * ctx0,
     int ndims,
     int64_t ne[],
@@ -112,7 +112,7 @@ struct ggml_tensor * get_random_tensor_f32(
     return result;
 }
-struct ggml_tensor * get_random_tensor_f16(
+static struct ggml_tensor * get_random_tensor_f16(
     struct ggml_context * ctx0,
     int ndims,
     int64_t ne[],
@@ -160,7 +160,7 @@ struct ggml_tensor * get_random_tensor_f16(
     return result;
 }
-struct ggml_tensor * get_random_tensor_i32(
+static struct ggml_tensor * get_random_tensor_i32(
     struct ggml_context * ctx0,
     int ndims,
     int64_t ne[],
@@ -208,7 +208,7 @@ struct ggml_tensor * get_random_tensor_i32(
     return result;
 }
-void print_elements(const char* label, const struct ggml_tensor * t) {
+static void print_elements(const char* label, const struct ggml_tensor * t) {
     if (!t) {
         printf("%s: %s = null\n", __func__, label);
         return;
@@ -228,7 +228,7 @@ void print_elements(const char* label, const struct ggml_tensor * t) {
 }
-bool check_gradient(
+static bool check_gradient(
     const char * op_name,
     struct ggml_context * ctx0,
     struct ggml_tensor * x[],
@@ -310,7 +310,7 @@ bool check_gradient(
 }
 // TODO: clean-up this ..
-bool check_mat_mul(
+static bool check_mat_mul(
     const struct ggml_tensor * y,
     const struct ggml_tensor * x0,
     const struct ggml_tensor * x1) {
@@ -373,9 +373,9 @@ bool check_mat_mul(
 int main(int argc, const char ** argv) {
     struct ggml_init_params params = {
-        .mem_size = 128*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc = false,
+        /* .mem_size = */ 128*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc = */ false,
     };
     int64_t ne[4];
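Editorial note: the initializer rewrite above is needed because these tests now build as C++, and C-style designated initializers (`.mem_size = ...`) are not part of standard C++ before C++20; keeping the field names as comments preserves readability. The pattern in isolation, with a hypothetical struct standing in for ggml_init_params:

```cpp
// Sketch of the pattern used above: positional aggregate initialization with
// the field names kept as comments, valid in both C and pre-C++20 C++.
#include <cstddef>

struct init_params {
    std::size_t mem_size;
    void *      mem_buffer;
    bool        no_alloc;
};

static const init_params params = {
    /* .mem_size   = */ 128u * 1024 * 1024,
    /* .mem_buffer = */ nullptr,
    /* .no_alloc   = */ false,
};

int main() {
    return params.no_alloc ? 1 : 0;
}
```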


@@ -1,9 +1,9 @@
 #include "ggml.h"
-#include <math.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <assert.h>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
 #define MAX_NARGS 2
@@ -119,10 +119,11 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
 int main(void) {
     struct ggml_init_params params = {
-        .mem_size = 1024*1024*1024,
-        .mem_buffer = NULL,
-        .no_alloc = false,
+        /* .mem_size = */ 1024*1024*1024,
+        /* .mem_buffer = */ NULL,
+        /* .no_alloc = */ false,
     };
     struct ggml_context * ctx = ggml_init(params);
     int64_t ne1[4] = {4, 128, 1, 1};