Merge branch 'ggerganov:master' into master
This commit is contained in:
commit
f33b3dc306
19 changed files with 3643 additions and 1536 deletions
|
@ -67,11 +67,13 @@ endif()
|
||||||
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
|
||||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
||||||
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||||
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
|
||||||
|
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
|
||||||
|
set(LLAMA_CUDA_MMQ_Y "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
|
||||||
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||||
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
|
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||||
|
@ -251,6 +253,10 @@ if (LLAMA_CUBLAS)
|
||||||
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
|
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
|
||||||
|
|
||||||
add_compile_definitions(GGML_USE_CUBLAS)
|
add_compile_definitions(GGML_USE_CUBLAS)
|
||||||
|
# if (LLAMA_CUDA_CUBLAS)
|
||||||
|
# add_compile_definitions(GGML_CUDA_CUBLAS)
|
||||||
|
# endif()
|
||||||
|
add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
|
||||||
if (LLAMA_CUDA_FORCE_DMMV)
|
if (LLAMA_CUDA_FORCE_DMMV)
|
||||||
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
||||||
endif()
|
endif()
|
||||||
|
@ -259,8 +265,8 @@ if (LLAMA_CUBLAS)
|
||||||
if (DEFINED LLAMA_CUDA_DMMV_Y)
|
if (DEFINED LLAMA_CUDA_DMMV_Y)
|
||||||
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
|
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_CUDA_DMMV_F16)
|
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
|
||||||
add_compile_definitions(GGML_CUDA_DMMV_F16)
|
add_compile_definitions(GGML_CUDA_F16)
|
||||||
endif()
|
endif()
|
||||||
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||||
|
|
||||||
|
@ -271,10 +277,14 @@ if (LLAMA_CUBLAS)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||||
|
# 52 == lowest CUDA 12 standard
|
||||||
|
# 60 == f16 CUDA intrinsics
|
||||||
|
# 61 == integer CUDA intrinsics
|
||||||
|
# 70 == (assumed) compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
||||||
if (LLAMA_CUDA_DMMV_F16)
|
if (LLAMA_CUDA_DMMV_F16)
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "60;61") # needed for f16 CUDA intrinsics
|
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
||||||
else()
|
else()
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
|
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||||
|
@ -497,6 +507,8 @@ endif()
|
||||||
add_library(ggml OBJECT
|
add_library(ggml OBJECT
|
||||||
ggml.c
|
ggml.c
|
||||||
ggml.h
|
ggml.h
|
||||||
|
ggml-alloc.c
|
||||||
|
ggml-alloc.h
|
||||||
${GGML_SOURCES_CUDA}
|
${GGML_SOURCES_CUDA}
|
||||||
${GGML_SOURCES_OPENCL}
|
${GGML_SOURCES_OPENCL}
|
||||||
${GGML_SOURCES_METAL}
|
${GGML_SOURCES_METAL}
|
||||||
|
|
24
Makefile
24
Makefile
|
@ -194,7 +194,7 @@ ifdef LLAMA_CUBLAS
|
||||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
|
||||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||||
OBJS += ggml-cuda.o
|
OBJS += ggml-cuda.o
|
||||||
NVCCFLAGS = --forward-unknown-to-host-compiler
|
NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
|
||||||
ifdef LLAMA_CUDA_NVCC
|
ifdef LLAMA_CUDA_NVCC
|
||||||
NVCC = $(LLAMA_CUDA_NVCC)
|
NVCC = $(LLAMA_CUDA_NVCC)
|
||||||
else
|
else
|
||||||
|
@ -220,19 +220,30 @@ else ifdef LLAMA_CUDA_DMMV_Y
|
||||||
else
|
else
|
||||||
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
|
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
|
||||||
endif # LLAMA_CUDA_MMV_Y
|
endif # LLAMA_CUDA_MMV_Y
|
||||||
|
ifdef LLAMA_CUDA_F16
|
||||||
|
NVCCFLAGS += -DGGML_CUDA_F16
|
||||||
|
endif # LLAMA_CUDA_F16
|
||||||
ifdef LLAMA_CUDA_DMMV_F16
|
ifdef LLAMA_CUDA_DMMV_F16
|
||||||
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
|
NVCCFLAGS += -DGGML_CUDA_F16
|
||||||
endif # LLAMA_CUDA_DMMV_F16
|
endif # LLAMA_CUDA_DMMV_F16
|
||||||
ifdef LLAMA_CUDA_KQUANTS_ITER
|
ifdef LLAMA_CUDA_KQUANTS_ITER
|
||||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||||
else
|
else
|
||||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||||
endif
|
endif
|
||||||
|
ifdef LLAMA_CUDA_MMQ_Y
|
||||||
|
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=$(LLAMA_CUDA_MMQ_Y)
|
||||||
|
else
|
||||||
|
NVCCFLAGS += -DGGML_CUDA_MMQ_Y=64
|
||||||
|
endif # LLAMA_CUDA_MMQ_Y
|
||||||
|
#ifdef LLAMA_CUDA_CUBLAS
|
||||||
|
# NVCCFLAGS += -DGGML_CUDA_CUBLAS
|
||||||
|
#endif # LLAMA_CUDA_CUBLAS
|
||||||
ifdef LLAMA_CUDA_CCBIN
|
ifdef LLAMA_CUDA_CCBIN
|
||||||
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
|
||||||
endif
|
endif
|
||||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(subst -Ofast,-O3,$(CXXFLAGS)) -Wno-pedantic -c $< -o $@
|
||||||
endif # LLAMA_CUBLAS
|
endif # LLAMA_CUBLAS
|
||||||
|
|
||||||
ifdef LLAMA_CLBLAST
|
ifdef LLAMA_CLBLAST
|
||||||
|
@ -318,7 +329,12 @@ $(info )
|
||||||
ggml.o: ggml.c ggml.h ggml-cuda.h
|
ggml.o: ggml.c ggml.h ggml-cuda.h
|
||||||
$(CC) $(CFLAGS) -c $< -o $@
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
|
|
||||||
llama.o: llama.cpp ggml.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
|
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
|
||||||
|
$(CC) $(CFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
OBJS += ggml-alloc.o
|
||||||
|
|
||||||
|
llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
common.o: examples/common.cpp examples/common.h
|
common.o: examples/common.cpp examples/common.h
|
||||||
|
|
|
@ -400,12 +400,16 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
|
|
||||||
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
|
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
|
||||||
|
|
||||||
|
<!---
|
||||||
|
| LLAMA_CUDA_CUBLAS | Boolean | false | Use cuBLAS instead of custom CUDA kernels for prompt processing. Faster for all quantization formats except for q4_0 and q8_0, especially for k-quants. Increases VRAM usage (700 MiB for 7b, 970 MiB for 13b, 1430 MiB for 33b). |
|
||||||
|
--->
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|-------------------------|------------------------|---------|-------------|
|
|-------------------------|------------------------|---------|-------------|
|
||||||
|
| LLAMA_CUDA_MMQ_Y | Positive integer >= 32 | 64 | Tile size in y direction when using the custom CUDA kernels for prompt processing. Higher values can be faster depending on the amount of shared memory available. Power of 2 heavily recommended. |
|
||||||
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
|
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
|
|
||||||
- #### CLBlast
|
- #### CLBlast
|
||||||
|
|
|
@ -352,7 +352,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
params.main_gpu = std::stoi(argv[i]);
|
params.main_gpu = std::stoi(argv[i]);
|
||||||
#else
|
#else
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n");
|
||||||
#endif
|
#endif
|
||||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||||
if (++i >= argc) {
|
if (++i >= argc) {
|
||||||
|
@ -376,13 +376,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
|
||||||
|
#endif // GGML_USE_CUBLAS
|
||||||
|
} else if (arg == "--mul-mat-q" || arg == "-mmq") {
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
|
params.mul_mat_q = true;
|
||||||
|
#else
|
||||||
|
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n");
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
} else if (arg == "--low-vram" || arg == "-lv") {
|
} else if (arg == "--low-vram" || arg == "-lv") {
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
params.low_vram = true;
|
params.low_vram = true;
|
||||||
#else
|
#else
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
|
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
} else if (arg == "--no-mmap") {
|
} else if (arg == "--no-mmap") {
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
|
@ -585,6 +591,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
|
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
|
||||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
|
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
|
||||||
|
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
|
||||||
|
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
|
||||||
|
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
|
||||||
#endif
|
#endif
|
||||||
fprintf(stdout, " --mtest compute maximum memory usage\n");
|
fprintf(stdout, " --mtest compute maximum memory usage\n");
|
||||||
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
|
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
|
||||||
|
@ -637,6 +646,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
|
||||||
lparams.main_gpu = params.main_gpu;
|
lparams.main_gpu = params.main_gpu;
|
||||||
lparams.tensor_split = params.tensor_split;
|
lparams.tensor_split = params.tensor_split;
|
||||||
lparams.low_vram = params.low_vram;
|
lparams.low_vram = params.low_vram;
|
||||||
|
lparams.mul_mat_q = params.mul_mat_q;
|
||||||
lparams.seed = params.seed;
|
lparams.seed = params.seed;
|
||||||
lparams.f16_kv = params.memory_f16;
|
lparams.f16_kv = params.memory_f16;
|
||||||
lparams.use_mmap = params.use_mmap;
|
lparams.use_mmap = params.use_mmap;
|
||||||
|
|
|
@ -74,6 +74,7 @@ struct gpt_params {
|
||||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||||
|
|
||||||
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
|
bool low_vram = false; // if true, reduce VRAM usage at the cost of performance
|
||||||
|
bool mul_mat_q = false; // if true, use experimental mul_mat_q kernels
|
||||||
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||||
bool random_prompt = false; // do not randomize prompt if none provided
|
bool random_prompt = false; // do not randomize prompt if none provided
|
||||||
bool use_color = false; // use color to distinguish generations and inputs
|
bool use_color = false; // use color to distinguish generations and inputs
|
||||||
|
|
|
@ -163,7 +163,7 @@ node .
|
||||||
|
|
||||||
`content`: Set the text to tokenize.
|
`content`: Set the text to tokenize.
|
||||||
|
|
||||||
Note that the special `BOS` token is not added in fron of the text and also a space character is not inserted automatically as it is for `/completion`.
|
Note that the special `BOS` token is not added in front of the text and also a space character is not inserted automatically as it is for `/completion`.
|
||||||
|
|
||||||
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
|
- **POST** `/embedding`: Generate embedding of a given text just as [the embedding example](../embedding) does.
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -3,12 +3,11 @@
|
||||||
<head>
|
<head>
|
||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
|
<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1" />
|
||||||
|
<meta name="color-scheme" content="light dark">
|
||||||
<title>llama.cpp - chat</title>
|
<title>llama.cpp - chat</title>
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
body {
|
body {
|
||||||
background-color: #fff;
|
|
||||||
color: #000;
|
|
||||||
font-family: system-ui;
|
font-family: system-ui;
|
||||||
font-size: 90%;
|
font-size: 90%;
|
||||||
}
|
}
|
||||||
|
|
|
@ -631,6 +631,9 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
||||||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||||
|
fprintf(stdout, " -mmq, --mul-mat-q use experimental mul_mat_q CUDA kernels instead of cuBLAS. TEMP!!!\n" );
|
||||||
|
fprintf(stdout, " Reduces VRAM usage by 700/970/1430 MiB for 7b/13b/33b but prompt processing speed\n" );
|
||||||
|
fprintf(stdout, " is still suboptimal, especially q2_K, q3_K, q5_K, and q6_K.\n" );
|
||||||
#endif
|
#endif
|
||||||
fprintf(stdout, " -m FNAME, --model FNAME\n");
|
fprintf(stdout, " -m FNAME, --model FNAME\n");
|
||||||
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
|
fprintf(stdout, " model path (default: %s)\n", params.model.c_str());
|
||||||
|
@ -827,7 +830,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.", {});
|
LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
}
|
}
|
||||||
else if (arg == "--low-vram" || arg == "-lv")
|
else if (arg == "--low-vram" || arg == "-lv")
|
||||||
|
@ -835,7 +838,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
#ifdef GGML_USE_CUBLAS
|
#ifdef GGML_USE_CUBLAS
|
||||||
params.low_vram = true;
|
params.low_vram = true;
|
||||||
#else
|
#else
|
||||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
|
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
|
||||||
|
#endif // GGML_USE_CUBLAS
|
||||||
|
}
|
||||||
|
else if (arg == "--mul-mat-q" || arg == "-mmq")
|
||||||
|
{
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
|
params.mul_mat_q = true;
|
||||||
|
#else
|
||||||
|
LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to use mul_mat_q kernels.\n", {});
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
}
|
}
|
||||||
else if (arg == "--main-gpu" || arg == "-mg")
|
else if (arg == "--main-gpu" || arg == "-mg")
|
||||||
|
|
541
ggml-alloc.c
Normal file
541
ggml-alloc.c
Normal file
|
@ -0,0 +1,541 @@
|
||||||
|
#include "ggml-alloc.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
#include <assert.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#define UNUSED(x) (void)(x)
|
||||||
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
|
//#define GGML_ALLOCATOR_DEBUG
|
||||||
|
|
||||||
|
//#define AT_PRINTF printf
|
||||||
|
#define AT_PRINTF(...) ((void)0)
|
||||||
|
|
||||||
|
// One slot of the allocator's open-addressed tensor hash table (see hash_get).
struct hash_node {
    // key of the slot; a NULL value marks the slot as unused
    struct ggml_tensor * t;
    // NOTE(review): presumably the number of graph nodes still consuming t - confirm against the graph allocator
    int n_children;
    // NOTE(review): presumably the number of view tensors aliasing t - confirm against the graph allocator
    int n_views;
};
|
||||||
|
|
||||||
|
static size_t hash(void * p) {
|
||||||
|
return (size_t)p % GGML_GRAPH_HASHTABLE_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct hash_node * hash_get(struct hash_node hash_table[], struct ggml_tensor * t) {
|
||||||
|
size_t h = hash(t);
|
||||||
|
|
||||||
|
// linear probing
|
||||||
|
size_t i = h;
|
||||||
|
while (hash_table[i].t != NULL) {
|
||||||
|
if (hash_table[i].t == t) {
|
||||||
|
return &hash_table[i];
|
||||||
|
}
|
||||||
|
i = (i + 1) % GGML_GRAPH_HASHTABLE_SIZE;
|
||||||
|
if (i == h) {
|
||||||
|
// hash table is full
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
hash_table[i].t = t;
|
||||||
|
return &hash_table[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: GGML_PAD ?
// Round `offset` up so that the address `buffer + offset` is a multiple of `alignment`.
//
// buffer    - base address the offset is relative to (may be NULL to align the offset itself)
// offset    - starting offset in bytes
// alignment - required alignment in bytes; must be a non-zero power of two (asserted)
//
// Returns the smallest offset >= `offset` that satisfies the alignment.
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
    assert(alignment && !(alignment & (alignment - 1))); // power of 2

    const uintptr_t address = (uintptr_t)buffer + offset;
    const size_t misalign = address % alignment;
    // bytes needed to reach the next boundary; nothing if already on one
    const size_t padding = misalign == 0 ? 0 : alignment - misalign;

    return offset + padding;
}
|
||||||
|
|
||||||
|
// A contiguous range of unused bytes inside the allocator's buffer.
struct free_block {
    void * addr; // first byte of the free range
    size_t size; // length of the free range in bytes
};
|
||||||
|
|
||||||
|
#define MAX_FREE_BLOCKS 128
|
||||||
|
|
||||||
|
struct ggml_allocr {
|
||||||
|
void * data;
|
||||||
|
size_t size;
|
||||||
|
size_t alignment;
|
||||||
|
int n_free_blocks;
|
||||||
|
struct free_block free_blocks[MAX_FREE_BLOCKS];
|
||||||
|
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||||
|
size_t max_size;
|
||||||
|
bool measure;
|
||||||
|
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
struct ggml_tensor * allocated_tensors[1024];
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
// Debug only: remember `tensor` as live by storing it in the first empty slot of
// alloc->allocated_tensors. Triggers GGML_ASSERT when every slot is occupied.
static void add_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    // derive the capacity from the array itself so it cannot drift from the struct definition
    const int n_slots = (int)(sizeof(alloc->allocated_tensors) / sizeof(alloc->allocated_tensors[0]));
    for (int i = 0; i < n_slots; i++) {
        if (alloc->allocated_tensors[i] == NULL) {
            alloc->allocated_tensors[i] = tensor;
            return;
        }
    }
    GGML_ASSERT(!"out of allocated_tensors");
}

// Debug only: forget `tensor`. A slot matches when it holds the same tensor pointer
// or a tensor that shares the same data pointer. Only the first matching slot is
// cleared. Prints a message and triggers GGML_ASSERT when no slot matches.
static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    const int n_slots = (int)(sizeof(alloc->allocated_tensors) / sizeof(alloc->allocated_tensors[0]));
    for (int i = 0; i < n_slots; i++) {
        if (alloc->allocated_tensors[i] == tensor ||
            (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
            alloc->allocated_tensors[i] = NULL;
            return;
        }
    }
    printf("tried to free tensor %s not found\n", tensor->name);
    GGML_ASSERT(!"tensor not found");
}
#endif
|
||||||
|
|
||||||
|
|
||||||
|
// Number of bytes to reserve for `tensor` before alignment padding is applied.
// Currently this is just ggml_nbytes(tensor); `alloc` is accepted but not used.
static size_t ggml_allocator_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
    UNUSED(alloc);

    const size_t n_bytes = ggml_nbytes(tensor);
    return n_bytes;
}
|
||||||
|
|
||||||
|
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||||
|
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||||
|
size = aligned_offset(NULL, size, alloc->alignment);
|
||||||
|
|
||||||
|
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
|
||||||
|
|
||||||
|
size_t max_avail = 0;
|
||||||
|
|
||||||
|
// find the best fitting free block
|
||||||
|
int best_fit_block = -1;
|
||||||
|
size_t best_fit_size = SIZE_MAX;
|
||||||
|
for (int i = 0; i < alloc->n_free_blocks; i++) {
|
||||||
|
struct free_block * block = &alloc->free_blocks[i];
|
||||||
|
max_avail = MAX(max_avail, block->size);
|
||||||
|
if (block->size >= size && block->size <= best_fit_size) {
|
||||||
|
best_fit_block = i;
|
||||||
|
best_fit_size = block->size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
AT_PRINTF("block %d\n", best_fit_block);
|
||||||
|
|
||||||
|
if (best_fit_block == -1) {
|
||||||
|
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
||||||
|
__func__, size, max_avail);
|
||||||
|
GGML_ASSERT(!"not enough space in the buffer");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
struct free_block * block = &alloc->free_blocks[best_fit_block];
|
||||||
|
void * addr = block->addr;
|
||||||
|
block->addr = (char*)block->addr + size;
|
||||||
|
block->size -= size;
|
||||||
|
if (block->size == 0) {
|
||||||
|
// remove block if empty
|
||||||
|
alloc->n_free_blocks--;
|
||||||
|
for (int j = best_fit_block; j < alloc->n_free_blocks; j++) {
|
||||||
|
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
tensor->data = addr;
|
||||||
|
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
add_allocated_tensor(alloc, tensor);
|
||||||
|
size_t cur_max = (char*)addr - (char*)alloc->data + size;
|
||||||
|
if (cur_max > alloc->max_size) {
|
||||||
|
printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
|
||||||
|
for (int i = 0; i < 1024; i++) {
|
||||||
|
if (alloc->allocated_tensors[i]) {
|
||||||
|
printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->data + size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// this is a very naive implementation, but for our case the number of free blocks should be very small
|
||||||
|
static void ggml_allocator_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
|
||||||
|
void * ptr = tensor->data;
|
||||||
|
|
||||||
|
if (ptr < alloc->data || (char*)ptr >= (char*)alloc->data + alloc->max_size) {
|
||||||
|
// the tensor was not allocated in this buffer
|
||||||
|
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
|
||||||
|
// the easiest way to deal with this is just to ignore it
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t size = ggml_allocator_get_alloc_size(alloc, tensor);
|
||||||
|
size = aligned_offset(NULL, size, alloc->alignment);
|
||||||
|
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
||||||
|
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
remove_allocated_tensor(alloc, tensor);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// see if we can merge with an existing block
|
||||||
|
for (int i = 0; i < alloc->n_free_blocks; i++) {
|
||||||
|
struct free_block * block = &alloc->free_blocks[i];
|
||||||
|
// check if ptr is at the end of the block
|
||||||
|
if ((char*)block->addr + block->size == ptr) {
|
||||||
|
block->size += size;
|
||||||
|
// check if we can merge with the next block
|
||||||
|
if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
|
||||||
|
block->size += alloc->free_blocks[i+1].size;
|
||||||
|
alloc->n_free_blocks--;
|
||||||
|
for (int j = i+1; j < alloc->n_free_blocks; j++) {
|
||||||
|
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// check if ptr is at the beginning of the block
|
||||||
|
if ((char*)ptr + size == block->addr) {
|
||||||
|
block->addr = ptr;
|
||||||
|
block->size += size;
|
||||||
|
// check if we can merge with the previous block
|
||||||
|
if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
|
||||||
|
alloc->free_blocks[i-1].size += block->size;
|
||||||
|
alloc->n_free_blocks--;
|
||||||
|
for (int j = i; j < alloc->n_free_blocks; j++) {
|
||||||
|
alloc->free_blocks[j] = alloc->free_blocks[j+1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// otherwise, add a new block
|
||||||
|
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
|
||||||
|
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
|
||||||
|
int insert_pos = 0;
|
||||||
|
while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
|
||||||
|
insert_pos++;
|
||||||
|
}
|
||||||
|
// shift all blocks from insert_pos onward to make room for the new block
|
||||||
|
for (int i = alloc->n_free_blocks; i > insert_pos; i--) {
|
||||||
|
alloc->free_blocks[i] = alloc->free_blocks[i-1];
|
||||||
|
}
|
||||||
|
// insert the new block
|
||||||
|
alloc->free_blocks[insert_pos].addr = ptr;
|
||||||
|
alloc->free_blocks[insert_pos].size = size;
|
||||||
|
alloc->n_free_blocks++;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_allocr_reset(struct ggml_allocr * alloc) {
|
||||||
|
alloc->n_free_blocks = 1;
|
||||||
|
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
|
||||||
|
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
|
||||||
|
alloc->free_blocks[0].size = alloc->size - align_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
|
||||||
|
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||||
|
|
||||||
|
*alloc = (struct ggml_allocr){
|
||||||
|
/*.data = */ data,
|
||||||
|
/*.size = */ size,
|
||||||
|
/*.alignment = */ alignment,
|
||||||
|
/*.n_free_blocks = */ 0,
|
||||||
|
/*.free_blocks = */ {{0}},
|
||||||
|
/*.hash_table = */ {{0}},
|
||||||
|
/*.max_size = */ 0,
|
||||||
|
/*.measure = */ false,
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
/*.allocated_tensors = */ = {0},
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
ggml_allocr_reset(alloc);
|
||||||
|
|
||||||
|
return alloc;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Placeholder buffer used by measure allocators (see ggml_allocr_new_measure):
// no real memory backs this range, only its address and size are used.
// The range must be large enough to fit all the tensors, and it must not
// overlap with any other existing buffer.
static void * const MEASURE_BASE_ADDR = (void *) 0x1000;
static const size_t MEASURE_MAX_SIZE  = 1024ULL * 1024 * 1024 * 1024; // 1 TB (2^40 bytes)
|
||||||
|
|
||||||
|
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
|
||||||
|
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
|
||||||
|
|
||||||
|
*alloc = (struct ggml_allocr){
|
||||||
|
/*.data = */ MEASURE_BASE_ADDR,
|
||||||
|
/*.size = */ MEASURE_MAX_SIZE,
|
||||||
|
/*.alignment = */ alignment,
|
||||||
|
/*.n_free_blocks = */ 0,
|
||||||
|
/*.free_blocks = */ {{0}},
|
||||||
|
/*.hash_table = */ {{0}},
|
||||||
|
/*.max_size = */ 0,
|
||||||
|
/*.measure = */ true,
|
||||||
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
/*.allocated_tensors = */ = {0},
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
|
||||||
|
ggml_allocr_reset(alloc);
|
||||||
|
|
||||||
|
return alloc;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Release the allocator struct itself. Nothing else is released here; in
// particular the `data` buffer passed to ggml_allocr_new() is left untouched.
// Passing NULL is harmless because free(NULL) is a no-op.
void ggml_allocr_free(struct ggml_allocr * alloc) {
    free(alloc);
}
|
||||||
|
|
||||||
|
bool ggml_allocr_is_measure(struct ggml_allocr * alloc) {
|
||||||
|
return alloc->measure;
|
||||||
|
}
|
||||||
|
|
||||||
|
//////////// compute graph allocator
|
||||||
|
|
||||||
|
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||||
|
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
|
||||||
|
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
|
||||||
|
if (a->type != b->type) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||||
|
if (a->ne[i] != b->ne[i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (a->nb[i] != b->nb[i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ggml_tensor * get_view_parent(struct ggml_tensor * t) {
|
||||||
|
switch (t->op) {
|
||||||
|
case GGML_OP_PERMUTE:
|
||||||
|
case GGML_OP_RESHAPE:
|
||||||
|
case GGML_OP_TRANSPOSE:
|
||||||
|
case GGML_OP_VIEW:
|
||||||
|
return t->src[0];
|
||||||
|
case GGML_OP_CPY:
|
||||||
|
return t->src[1];
|
||||||
|
default:
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Follow a chain of view ops starting at `t` and return the first tensor in
// the chain that is not itself a view.
// `t` is expected to be a view: its parent is looked up at least once, and the
// result of each lookup is passed to ggml_is_view() without a NULL check.
static struct ggml_tensor * get_view_source(struct ggml_tensor * t) {
    struct ggml_tensor * cur = get_view_parent(t);

    while (ggml_is_view(cur)) {
        cur = get_view_parent(cur);
    }

    return cur;
}
|
||||||
|
|
||||||
|
static bool ggml_op_can_inplace(enum ggml_op op) {
|
||||||
|
switch (op) {
|
||||||
|
case GGML_OP_SCALE:
|
||||||
|
case GGML_OP_DIAG_MASK_ZERO:
|
||||||
|
case GGML_OP_DIAG_MASK_INF:
|
||||||
|
case GGML_OP_ADD:
|
||||||
|
case GGML_OP_ADD1:
|
||||||
|
case GGML_OP_ACC:
|
||||||
|
case GGML_OP_SUB:
|
||||||
|
case GGML_OP_MUL:
|
||||||
|
case GGML_OP_DIV:
|
||||||
|
case GGML_OP_SQR:
|
||||||
|
case GGML_OP_SQRT:
|
||||||
|
case GGML_OP_LOG:
|
||||||
|
case GGML_OP_UNARY:
|
||||||
|
case GGML_OP_ROPE:
|
||||||
|
case GGML_OP_RMS_NORM:
|
||||||
|
case GGML_OP_SET:
|
||||||
|
case GGML_OP_SOFT_MAX:
|
||||||
|
case GGML_OP_CONT:
|
||||||
|
return true;
|
||||||
|
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
|
||||||
|
struct hash_node * ht = alloc->hash_table;
|
||||||
|
if (node->data == NULL) {
|
||||||
|
if (ggml_is_view(node)) {
|
||||||
|
size_t offset;
|
||||||
|
switch(node->op) {
|
||||||
|
case GGML_OP_VIEW:
|
||||||
|
memcpy(&offset, node->op_params, sizeof(size_t));
|
||||||
|
node->data = (char *) node->src[0]->data + offset;
|
||||||
|
break;
|
||||||
|
case GGML_OP_PERMUTE:
|
||||||
|
case GGML_OP_RESHAPE:
|
||||||
|
case GGML_OP_TRANSPOSE:
|
||||||
|
node->data = node->src[0]->data;
|
||||||
|
break;
|
||||||
|
case GGML_OP_CPY:
|
||||||
|
node->data = node->src[1]->data;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(!"unknown view op");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// see if we can reuse a parent's buffer (inplace)
|
||||||
|
if (ggml_op_can_inplace(node->op)) {
|
||||||
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
|
struct ggml_tensor * parent = node->src[i];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
struct hash_node * p_hn = hash_get(ht, parent);
|
||||||
|
if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
|
||||||
|
if (ggml_is_view(parent)) {
|
||||||
|
struct ggml_tensor * view_src = get_view_source(parent);
|
||||||
|
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||||
|
if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
|
||||||
|
// TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
|
||||||
|
// the parent's data that it will need later (same layout requirement). the problem is that then
|
||||||
|
// we cannot free the tensor because the original address of the allocation is lost.
|
||||||
|
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
|
||||||
|
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
|
||||||
|
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
||||||
|
node->data = parent->data;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||||
|
node->data = parent->data;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ggml_allocr_alloc(alloc, node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static size_t ggml_allocator_alloc_graph_tensors_n(
|
||||||
|
struct ggml_allocr * alloc,
|
||||||
|
struct ggml_cgraph ** graphs, int n_graphs,
|
||||||
|
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
|
||||||
|
|
||||||
|
// reset hash table
|
||||||
|
struct hash_node * ht = alloc->hash_table;
|
||||||
|
memset(ht, 0, sizeof(struct hash_node) * GGML_GRAPH_HASHTABLE_SIZE);
|
||||||
|
|
||||||
|
// count number of children and views
|
||||||
|
for (int g = 0; g < n_graphs; g++) {
|
||||||
|
struct ggml_cgraph * gf = graphs[g];
|
||||||
|
for (int i = 0; i < gf->n_nodes; i++) {
|
||||||
|
struct ggml_tensor * node = gf->nodes[i];
|
||||||
|
|
||||||
|
if (ggml_is_view(node)) {
|
||||||
|
struct ggml_tensor * view_src = get_view_source(node);
|
||||||
|
hash_get(ht, view_src)->n_views += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * parent = node->src[j];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
hash_get(ht, parent)->n_children += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate tensors
|
||||||
|
for (int g = 0; g < n_graphs; g++) {
|
||||||
|
struct ggml_cgraph * gf = graphs[g];
|
||||||
|
AT_PRINTF("####### graph %d/%d\n", g, n_graphs);
|
||||||
|
// graph inputs are allocated first to ensure that they are not overwritten by each other
|
||||||
|
if (inputs != NULL && inputs[g] != NULL) {
|
||||||
|
for (int i = 0; inputs[g][i] != NULL; i++) {
|
||||||
|
struct ggml_tensor * input = inputs[g][i];
|
||||||
|
AT_PRINTF("input: %s\n", input->name);
|
||||||
|
allocate_node(alloc, input);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < gf->n_nodes; i++) {
|
||||||
|
struct ggml_tensor * node = gf->nodes[i];
|
||||||
|
|
||||||
|
// allocate parents (leafs)
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * parent = node->src[j];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
allocate_node(alloc, parent);
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate node
|
||||||
|
allocate_node(alloc, node);
|
||||||
|
|
||||||
|
AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * parent = node->src[j];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
AT_PRINTF("%s", parent->name);
|
||||||
|
if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
|
||||||
|
AT_PRINTF(", ");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
AT_PRINTF("\n");
|
||||||
|
|
||||||
|
// update parents
|
||||||
|
for (int j = 0; j < GGML_MAX_SRC; j++) {
|
||||||
|
struct ggml_tensor * parent = node->src[j];
|
||||||
|
if (parent == NULL) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
struct hash_node * p_hn = hash_get(ht, parent);
|
||||||
|
p_hn->n_children -= 1;
|
||||||
|
|
||||||
|
//AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
|
||||||
|
|
||||||
|
if (p_hn->n_children == 0 && p_hn->n_views == 0) {
|
||||||
|
if (ggml_is_view(parent)) {
|
||||||
|
struct ggml_tensor * view_src = get_view_source(parent);
|
||||||
|
struct hash_node * view_src_hn = hash_get(ht, view_src);
|
||||||
|
view_src_hn->n_views -= 1;
|
||||||
|
AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src->n_children, view_src->n_views);
|
||||||
|
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src->data != node->data) {
|
||||||
|
ggml_allocator_free_tensor(alloc, view_src);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (parent->data != node->data) {
|
||||||
|
ggml_allocator_free_tensor(alloc, parent);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
AT_PRINTF("\n");
|
||||||
|
}
|
||||||
|
// free graph outputs here that wouldn't be freed otherwise because they have no children
|
||||||
|
if (outputs != NULL && outputs[g] != NULL) {
|
||||||
|
for (int i = 0; outputs[g][i] != NULL; i++) {
|
||||||
|
struct ggml_tensor * output = outputs[g][i];
|
||||||
|
AT_PRINTF("output: %s\n", output->name);
|
||||||
|
ggml_allocator_free_tensor(alloc, output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return alloc->max_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allocate all tensors of a single compute graph from `alloc`.
// This is the one-graph form of ggml_allocator_alloc_graph_tensors_n(), called
// without explicit input or output tensor lists; its return value is passed through.
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
    struct ggml_cgraph * single[1] = { graph };

    return ggml_allocator_alloc_graph_tensors_n(alloc, single, 1, NULL, NULL);
}
|
22
ggml-alloc.h
Normal file
22
ggml-alloc.h
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
#pragma once

#include "ggml.h"

#ifdef  __cplusplus
extern "C" {
#endif


// create an allocator that places tensors in the caller-provided buffer `data` of `size` bytes
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
// create an allocator in measure mode: it is not backed by a real buffer
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);

// release the allocator itself (the buffer passed to ggml_allocr_new is not released)
GGML_API void   ggml_allocr_free(struct ggml_allocr * alloc);
// true if the allocator was created with ggml_allocr_new_measure
GGML_API bool   ggml_allocr_is_measure(struct ggml_allocr * alloc);
// forget all allocations and start again with the whole buffer free
GGML_API void   ggml_allocr_reset(struct ggml_allocr * alloc);
// allocate a single tensor
GGML_API void   ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
// allocate all the tensors of a compute graph; returns the allocator's max_size
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);


#ifdef  __cplusplus
}
#endif
|
1994
ggml-cuda.cu
1994
ggml-cuda.cu
File diff suppressed because it is too large
Load diff
|
@ -27,6 +27,7 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
||||||
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
||||||
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
|
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
|
||||||
void ggml_cuda_set_main_device(int main_device);
|
void ggml_cuda_set_main_device(int main_device);
|
||||||
|
void ggml_cuda_set_mul_mat_q(bool mul_mat_q);
|
||||||
void ggml_cuda_set_scratch_size(size_t scratch_size);
|
void ggml_cuda_set_scratch_size(size_t scratch_size);
|
||||||
void ggml_cuda_free_scratch(void);
|
void ggml_cuda_free_scratch(void);
|
||||||
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||||
|
|
33
ggml-metal.m
33
ggml-metal.m
|
@ -718,7 +718,8 @@ void ggml_metal_graph_compute(
|
||||||
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
|
// TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224
|
||||||
|
|
||||||
GGML_ASSERT(ne00 == ne10);
|
GGML_ASSERT(ne00 == ne10);
|
||||||
GGML_ASSERT(ne02 == ne12);
|
// GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere
|
||||||
|
GGML_ASSERT(ne03 == ne13);
|
||||||
|
|
||||||
if (ggml_is_contiguous(src0) &&
|
if (ggml_is_contiguous(src0) &&
|
||||||
ggml_is_contiguous(src1) &&
|
ggml_is_contiguous(src1) &&
|
||||||
|
@ -746,11 +747,11 @@ void ggml_metal_graph_compute(
|
||||||
initWithDevice:ctx->device transposeLeft:false transposeRight:true
|
initWithDevice:ctx->device transposeLeft:false transposeRight:true
|
||||||
resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
|
resultRows:ne11 resultColumns:ne01 interiorColumns:ne00 alpha:1.0 beta:0.0];
|
||||||
|
|
||||||
// we need to do ne02 multiplications
|
// we need to do ne12 multiplications
|
||||||
// TODO: is there a way to do this in parallel - currently very slow ..
|
// TODO: is there a way to do this in parallel - currently very slow ..
|
||||||
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
|
// TODO: might be possible to offload part of the computation to ANE using Accelerate's CBLAS
|
||||||
for (int64_t i02 = 0; i02 < ne02; ++i02) {
|
for (int64_t i02 = 0; i02 < ne12; ++i02) {
|
||||||
size_t offs_src0_cur = offs_src0 + i02*nb02;
|
size_t offs_src0_cur = offs_src0 + i02/(ne12/ne02)*nb02; // gqa not used for now
|
||||||
size_t offs_src1_cur = offs_src1 + i02*nb12;
|
size_t offs_src1_cur = offs_src1 + i02*nb12;
|
||||||
size_t offs_dst_cur = offs_dst + i02*nb2;
|
size_t offs_dst_cur = offs_dst + i02*nb2;
|
||||||
|
|
||||||
|
@ -772,8 +773,6 @@ void ggml_metal_graph_compute(
|
||||||
switch (src0t) {
|
switch (src0t) {
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
GGML_ASSERT(ne02 == ne12);
|
|
||||||
|
|
||||||
nth0 = 64;
|
nth0 = 64;
|
||||||
nth1 = 1;
|
nth1 = 1;
|
||||||
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
[encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32];
|
||||||
|
@ -853,16 +852,18 @@ void ggml_metal_graph_compute(
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||||
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
|
||||||
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
|
||||||
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:5];
|
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
|
||||||
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:6];
|
[encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
|
||||||
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:7];
|
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
|
||||||
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:8];
|
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
|
||||||
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:9];
|
[encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
|
||||||
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10];
|
[encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
|
||||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
|
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
|
||||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
|
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
|
||||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
|
||||||
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
|
||||||
|
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:15];
|
||||||
|
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:16];
|
||||||
|
|
||||||
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
|
||||||
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_Q4_K) {
|
||||||
|
|
|
@ -509,11 +509,13 @@ kernel void kernel_mul_mat_f16_f32(
|
||||||
device float * dst,
|
device float * dst,
|
||||||
constant int64_t & ne00,
|
constant int64_t & ne00,
|
||||||
constant int64_t & ne01,
|
constant int64_t & ne01,
|
||||||
|
constant int64_t & ne02,
|
||||||
constant uint64_t & nb00,
|
constant uint64_t & nb00,
|
||||||
constant uint64_t & nb01,
|
constant uint64_t & nb01,
|
||||||
constant uint64_t & nb02,
|
constant uint64_t & nb02,
|
||||||
constant int64_t & ne10,
|
constant int64_t & ne10,
|
||||||
constant int64_t & ne11,
|
constant int64_t & ne11,
|
||||||
|
constant int64_t & ne12,
|
||||||
constant uint64_t & nb10,
|
constant uint64_t & nb10,
|
||||||
constant uint64_t & nb11,
|
constant uint64_t & nb11,
|
||||||
constant uint64_t & nb12,
|
constant uint64_t & nb12,
|
||||||
|
@ -529,7 +531,7 @@ kernel void kernel_mul_mat_f16_f32(
|
||||||
const int64_t r1 = tgpig.y;
|
const int64_t r1 = tgpig.y;
|
||||||
const int64_t im = tgpig.z;
|
const int64_t im = tgpig.z;
|
||||||
|
|
||||||
device const half * x = (device const half *) (src0 + r0*nb01 + im*nb02);
|
device const half * x = (device const half *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
||||||
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
device const float * y = (device const float *) (src1 + r1*nb11 + im*nb12);
|
||||||
|
|
||||||
sum[tpitg.x] = 0.0f;
|
sum[tpitg.x] = 0.0f;
|
||||||
|
@ -552,6 +554,7 @@ kernel void kernel_mul_mat_f16_f32(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
kernel void kernel_alibi_f32(
|
kernel void kernel_alibi_f32(
|
||||||
device const float * src0,
|
device const float * src0,
|
||||||
device float * dst,
|
device float * dst,
|
||||||
|
|
75
ggml.c
75
ggml.c
|
@ -4557,10 +4557,12 @@ static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml
|
||||||
|
|
||||||
static struct ggml_tensor * ggml_new_tensor_impl(
|
static struct ggml_tensor * ggml_new_tensor_impl(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int n_dims,
|
int n_dims,
|
||||||
const int64_t* ne,
|
const int64_t * ne,
|
||||||
void* data) {
|
void * data) {
|
||||||
|
|
||||||
|
assert(n_dims >= 1 && n_dims <= GGML_MAX_DIMS);
|
||||||
|
|
||||||
size_t data_size = 0;
|
size_t data_size = 0;
|
||||||
|
|
||||||
|
@ -4648,22 +4650,22 @@ static void ggml_set_op_params_i32(struct ggml_tensor * tensor, uint32_t i, int3
|
||||||
|
|
||||||
struct ggml_tensor * ggml_new_tensor(
|
struct ggml_tensor * ggml_new_tensor(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int n_dims,
|
int n_dims,
|
||||||
const int64_t * ne) {
|
const int64_t * ne) {
|
||||||
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
|
return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * ggml_new_tensor_1d(
|
struct ggml_tensor * ggml_new_tensor_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int64_t ne0) {
|
int64_t ne0) {
|
||||||
return ggml_new_tensor(ctx, type, 1, &ne0);
|
return ggml_new_tensor(ctx, type, 1, &ne0);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * ggml_new_tensor_2d(
|
struct ggml_tensor * ggml_new_tensor_2d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int64_t ne0,
|
int64_t ne0,
|
||||||
int64_t ne1) {
|
int64_t ne1) {
|
||||||
const int64_t ne[2] = { ne0, ne1 };
|
const int64_t ne[2] = { ne0, ne1 };
|
||||||
|
@ -4672,7 +4674,7 @@ struct ggml_tensor * ggml_new_tensor_2d(
|
||||||
|
|
||||||
struct ggml_tensor * ggml_new_tensor_3d(
|
struct ggml_tensor * ggml_new_tensor_3d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
enum ggml_type type,
|
enum ggml_type type,
|
||||||
int64_t ne0,
|
int64_t ne0,
|
||||||
int64_t ne1,
|
int64_t ne1,
|
||||||
int64_t ne2) {
|
int64_t ne2) {
|
||||||
|
@ -6238,6 +6240,27 @@ struct ggml_tensor * ggml_reshape_4d(
|
||||||
|
|
||||||
// ggml_view_1d
|
// ggml_view_1d
|
||||||
|
|
||||||
|
static struct ggml_tensor * ggml_view_tensor_offset(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int n_dims,
|
||||||
|
const int64_t * ne,
|
||||||
|
size_t offset) {
|
||||||
|
// don't calculate an offset from an unallocated tensor
|
||||||
|
void * data = NULL;
|
||||||
|
if (a->data != NULL) {
|
||||||
|
data = (char *) a->data + offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, data);
|
||||||
|
|
||||||
|
ggml_format_name(result, "%s (view)", a->name);
|
||||||
|
|
||||||
|
ggml_set_op_params(result, &offset, sizeof(offset));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_tensor * ggml_view_1d(
|
struct ggml_tensor * ggml_view_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
@ -6250,10 +6273,7 @@ struct ggml_tensor * ggml_view_1d(
|
||||||
is_node = true;
|
is_node = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);
|
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 1, &ne0, offset);
|
||||||
ggml_format_name(result, "%s (view)", a->name);
|
|
||||||
|
|
||||||
ggml_set_op_params(result, &offset, sizeof(offset));
|
|
||||||
|
|
||||||
result->op = GGML_OP_VIEW;
|
result->op = GGML_OP_VIEW;
|
||||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||||
|
@ -6280,10 +6300,7 @@ struct ggml_tensor * ggml_view_2d(
|
||||||
|
|
||||||
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };
|
||||||
|
|
||||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset);
|
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 2, ne, offset);
|
||||||
ggml_format_name(result, "%s (view)", a->name);
|
|
||||||
|
|
||||||
ggml_set_op_params(result, &offset, sizeof(offset));
|
|
||||||
|
|
||||||
result->nb[1] = nb1;
|
result->nb[1] = nb1;
|
||||||
result->nb[2] = result->nb[1]*ne1;
|
result->nb[2] = result->nb[1]*ne1;
|
||||||
|
@ -6316,10 +6333,7 @@ struct ggml_tensor * ggml_view_3d(
|
||||||
|
|
||||||
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, 1 };
|
||||||
|
|
||||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset);
|
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 3, ne, offset);
|
||||||
ggml_format_name(result, "%s (view)", a->name);
|
|
||||||
|
|
||||||
ggml_set_op_params(result, &offset, sizeof(offset));
|
|
||||||
|
|
||||||
result->nb[1] = nb1;
|
result->nb[1] = nb1;
|
||||||
result->nb[2] = nb2;
|
result->nb[2] = nb2;
|
||||||
|
@ -6354,10 +6368,7 @@ struct ggml_tensor * ggml_view_4d(
|
||||||
|
|
||||||
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
|
const int64_t ne[GGML_MAX_DIMS] = { ne0, ne1, ne2, ne3 };
|
||||||
|
|
||||||
struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset);
|
struct ggml_tensor * result = ggml_view_tensor_offset(ctx, a, 4, ne, offset);
|
||||||
ggml_format_name(result, "%s (view)", a->name);
|
|
||||||
|
|
||||||
ggml_set_op_params(result, &offset, sizeof(offset));
|
|
||||||
|
|
||||||
result->nb[1] = nb1;
|
result->nb[1] = nb1;
|
||||||
result->nb[2] = nb2;
|
result->nb[2] = nb2;
|
||||||
|
@ -6741,6 +6752,18 @@ struct ggml_tensor * ggml_rope_inplace(
|
||||||
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
|
return ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, 10000.0f, 1.0f, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RoPE with caller-supplied `freq_base` and `freq_scale`.
// Forwards to ggml_rope_impl() with its last argument set to false
// (presumably the in-place flag, since ggml_rope_inplace() passes true in the
// same position - confirm against ggml_rope_impl).
struct ggml_tensor * ggml_rope_custom(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        int                   n_past,
        int                   n_dims,
        int                   mode,
        int                   n_ctx,
        float                 freq_base,
        float                 freq_scale) {
    struct ggml_tensor * result =
        ggml_rope_impl(ctx, a, n_past, n_dims, mode, n_ctx, freq_base, freq_scale, false);

    return result;
}
|
||||||
|
|
||||||
struct ggml_tensor * ggml_rope_custom_inplace(
|
struct ggml_tensor * ggml_rope_custom_inplace(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
|
13
ggml.h
13
ggml.h
|
@ -1170,7 +1170,18 @@ extern "C" {
|
||||||
int mode,
|
int mode,
|
||||||
int n_ctx);
|
int n_ctx);
|
||||||
|
|
||||||
// custom RoPE, in-place, returns view(a)
|
// custom RoPE
|
||||||
|
GGML_API struct ggml_tensor * ggml_rope_custom(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
int n_past,
|
||||||
|
int n_dims,
|
||||||
|
int mode,
|
||||||
|
int n_ctx,
|
||||||
|
float freq_base,
|
||||||
|
float freq_scale);
|
||||||
|
|
||||||
|
// in-place, returns view(a)
|
||||||
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
|
261
llama.cpp
261
llama.cpp
|
@ -56,8 +56,14 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
|
||||||
|
#include "ggml-alloc.h"
|
||||||
|
#define LLAMA_USE_ALLOCATOR
|
||||||
|
#else
|
||||||
#define LLAMA_USE_SCRATCH
|
#define LLAMA_USE_SCRATCH
|
||||||
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
#define LLAMA_MAX_SCRATCH_BUFFERS 16
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
// available llama models
|
// available llama models
|
||||||
enum e_model {
|
enum e_model {
|
||||||
|
@ -327,13 +333,22 @@ struct llama_model {
|
||||||
|
|
||||||
struct llama_context {
|
struct llama_context {
|
||||||
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
llama_context(const llama_model & model) : model(model), t_load_us(model.t_load_us), t_start_us(model.t_start_us) {}
|
||||||
#ifdef GGML_USE_METAL
|
|
||||||
~llama_context() {
|
~llama_context() {
|
||||||
|
if (model_owner) {
|
||||||
|
delete &model;
|
||||||
|
}
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
if (ctx_metal) {
|
if (ctx_metal) {
|
||||||
ggml_metal_free(ctx_metal);
|
ggml_metal_free(ctx_metal);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef LLAMA_USE_ALLOCATOR
|
||||||
|
if (alloc) {
|
||||||
|
ggml_allocr_free(alloc);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
std::mt19937 rng;
|
std::mt19937 rng;
|
||||||
|
|
||||||
bool has_evaluated_once = false;
|
bool has_evaluated_once = false;
|
||||||
|
@ -371,7 +386,17 @@ struct llama_context {
|
||||||
// memory buffers used to evaluate the model
|
// memory buffers used to evaluate the model
|
||||||
// TODO: move in llama_state
|
// TODO: move in llama_state
|
||||||
llama_ctx_buffer buf_compute;
|
llama_ctx_buffer buf_compute;
|
||||||
|
|
||||||
|
#ifdef LLAMA_USE_ALLOCATOR
|
||||||
|
llama_ctx_buffer buf_alloc;
|
||||||
|
ggml_allocr * alloc = NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef LLAMA_USE_SCRATCH
|
||||||
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
|
llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
|
||||||
|
int buf_last = 0;
|
||||||
|
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
ggml_metal_context * ctx_metal = NULL;
|
ggml_metal_context * ctx_metal = NULL;
|
||||||
|
@ -381,9 +406,6 @@ struct llama_context {
|
||||||
ggml_mpi_context * ctx_mpi = NULL;
|
ggml_mpi_context * ctx_mpi = NULL;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
int buf_last = 0;
|
|
||||||
size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
|
|
||||||
|
|
||||||
void use_buf(struct ggml_context * ctx, int i) {
|
void use_buf(struct ggml_context * ctx, int i) {
|
||||||
#if defined(LLAMA_USE_SCRATCH)
|
#if defined(LLAMA_USE_SCRATCH)
|
||||||
size_t last_size = 0;
|
size_t last_size = 0;
|
||||||
|
@ -879,6 +901,7 @@ struct llama_context_params llama_context_default_params() {
|
||||||
/*.progress_callback =*/ nullptr,
|
/*.progress_callback =*/ nullptr,
|
||||||
/*.progress_callback_user_data =*/ nullptr,
|
/*.progress_callback_user_data =*/ nullptr,
|
||||||
/*.low_vram =*/ false,
|
/*.low_vram =*/ false,
|
||||||
|
/*.mul_mat_q =*/ false,
|
||||||
/*.f16_kv =*/ true,
|
/*.f16_kv =*/ true,
|
||||||
/*.logits_all =*/ false,
|
/*.logits_all =*/ false,
|
||||||
/*.vocab_only =*/ false,
|
/*.vocab_only =*/ false,
|
||||||
|
@ -1006,6 +1029,7 @@ static void llama_model_load_internal(
|
||||||
int n_gpu_layers,
|
int n_gpu_layers,
|
||||||
int main_gpu,
|
int main_gpu,
|
||||||
const float * tensor_split,
|
const float * tensor_split,
|
||||||
|
const bool mul_mat_q,
|
||||||
float rope_freq_base,
|
float rope_freq_base,
|
||||||
float rope_freq_scale,
|
float rope_freq_scale,
|
||||||
bool low_vram,
|
bool low_vram,
|
||||||
|
@ -1134,9 +1158,11 @@ static void llama_model_load_internal(
|
||||||
}
|
}
|
||||||
|
|
||||||
(void) main_gpu;
|
(void) main_gpu;
|
||||||
|
(void) mul_mat_q;
|
||||||
#if defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_CUBLAS)
|
||||||
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
|
fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
|
||||||
ggml_cuda_set_main_device(main_gpu);
|
ggml_cuda_set_main_device(main_gpu);
|
||||||
|
ggml_cuda_set_mul_mat_q(mul_mat_q);
|
||||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
||||||
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
|
#define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
|
||||||
#elif defined(GGML_USE_CLBLAST)
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
|
@ -1230,12 +1256,16 @@ static void llama_model_load_internal(
|
||||||
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
|
||||||
|
|
||||||
// this is the total memory required to run the inference
|
// this is the total memory required to run the inference
|
||||||
const size_t mem_required =
|
size_t mem_required =
|
||||||
ctx_size +
|
ctx_size +
|
||||||
mmapped_size - vram_weights + // weights in VRAM not in memory
|
mmapped_size - vram_weights; // weights in VRAM not in memory
|
||||||
|
|
||||||
|
#ifndef LLAMA_USE_ALLOCATOR
|
||||||
|
mem_required +=
|
||||||
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
|
MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) +
|
||||||
MEM_REQ_SCRATCH1().at(model.type) +
|
MEM_REQ_SCRATCH1().at(model.type) +
|
||||||
MEM_REQ_EVAL().at(model.type);
|
MEM_REQ_EVAL().at(model.type);
|
||||||
|
#endif
|
||||||
|
|
||||||
// this is the memory required by one llama_state
|
// this is the memory required by one llama_state
|
||||||
const size_t mem_required_state =
|
const size_t mem_required_state =
|
||||||
|
@ -1341,6 +1371,7 @@ static bool llama_model_load(
|
||||||
int n_gpu_layers,
|
int n_gpu_layers,
|
||||||
int main_gpu,
|
int main_gpu,
|
||||||
const float * tensor_split,
|
const float * tensor_split,
|
||||||
|
const bool mul_mat_q,
|
||||||
float rope_freq_base,
|
float rope_freq_base,
|
||||||
float rope_freq_scale,
|
float rope_freq_scale,
|
||||||
bool low_vram,
|
bool low_vram,
|
||||||
|
@ -1351,7 +1382,8 @@ static bool llama_model_load(
|
||||||
llama_progress_callback progress_callback,
|
llama_progress_callback progress_callback,
|
||||||
void *progress_callback_user_data) {
|
void *progress_callback_user_data) {
|
||||||
try {
|
try {
|
||||||
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, rms_norm_eps, n_gpu_layers,
|
||||||
|
main_gpu, tensor_split, mul_mat_q, rope_freq_base, rope_freq_scale, low_vram, memory_type,
|
||||||
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
|
||||||
return true;
|
return true;
|
||||||
} catch (const std::exception & err) {
|
} catch (const std::exception & err) {
|
||||||
|
@ -1360,32 +1392,15 @@ static bool llama_model_load(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// evaluate the transformer
|
static struct ggml_cgraph * llama_build_graph(
|
||||||
//
|
|
||||||
// - lctx: llama context
|
|
||||||
// - tokens: new batch of tokens to process
|
|
||||||
// - embd embeddings input
|
|
||||||
// - n_tokens number of tokens
|
|
||||||
// - n_past: the context size so far
|
|
||||||
// - n_threads: number of threads to use
|
|
||||||
//
|
|
||||||
static bool llama_eval_internal(
|
|
||||||
llama_context & lctx,
|
llama_context & lctx,
|
||||||
const llama_token * tokens,
|
const llama_token * tokens,
|
||||||
const float * embd,
|
const float * embd,
|
||||||
int n_tokens,
|
int n_tokens,
|
||||||
int n_past,
|
int n_past) {
|
||||||
int n_threads,
|
|
||||||
const char * cgraph_fname) {
|
|
||||||
|
|
||||||
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
||||||
|
|
||||||
#ifdef GGML_USE_MPI
|
|
||||||
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
const int64_t t_start_us = ggml_time_us();
|
|
||||||
|
|
||||||
const int N = n_tokens;
|
const int N = n_tokens;
|
||||||
|
|
||||||
const auto & model = lctx.model;
|
const auto & model = lctx.model;
|
||||||
|
@ -1401,10 +1416,8 @@ static bool llama_eval_internal(
|
||||||
const int64_t n_head = hparams.n_head;
|
const int64_t n_head = hparams.n_head;
|
||||||
const int64_t n_head_kv = hparams.n_head_kv;
|
const int64_t n_head_kv = hparams.n_head_kv;
|
||||||
const int64_t n_embd_head = hparams.n_embd_head();
|
const int64_t n_embd_head = hparams.n_embd_head();
|
||||||
const int64_t n_vocab = hparams.n_vocab;
|
|
||||||
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
const int64_t n_embd_gqa = hparams.n_embd_gqa();
|
||||||
|
|
||||||
|
|
||||||
LLAMA_ASSERT(n_embd_head == hparams.n_rot);
|
LLAMA_ASSERT(n_embd_head == hparams.n_rot);
|
||||||
|
|
||||||
const float freq_base = hparams.rope_freq_base;
|
const float freq_base = hparams.rope_freq_base;
|
||||||
|
@ -1416,26 +1429,35 @@ static bool llama_eval_internal(
|
||||||
auto & mem_per_token = lctx.mem_per_token;
|
auto & mem_per_token = lctx.mem_per_token;
|
||||||
auto & buf_compute = lctx.buf_compute;
|
auto & buf_compute = lctx.buf_compute;
|
||||||
|
|
||||||
|
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/*.mem_size =*/ buf_compute.size,
|
/*.mem_size =*/ buf_compute.size,
|
||||||
/*.mem_buffer =*/ buf_compute.addr,
|
/*.mem_buffer =*/ buf_compute.addr,
|
||||||
/*.no_alloc =*/ false,
|
/*.no_alloc =*/ false,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef LLAMA_USE_ALLOCATOR
|
||||||
|
params.no_alloc = true;
|
||||||
|
#endif
|
||||||
|
|
||||||
struct ggml_context * ctx0 = ggml_init(params);
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
|
|
||||||
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
ggml_cgraph * gf = ggml_new_graph(ctx0);
|
||||||
|
|
||||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
|
||||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
|
||||||
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
|
||||||
if (tokens) {
|
if (tokens) {
|
||||||
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||||
|
|
||||||
|
#ifdef LLAMA_USE_ALLOCATOR
|
||||||
|
ggml_allocr_alloc(lctx.alloc, inp_tokens);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
||||||
|
}
|
||||||
|
#else
|
||||||
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
|
||||||
|
#endif
|
||||||
ggml_set_name(inp_tokens, "inp_tokens");
|
ggml_set_name(inp_tokens, "inp_tokens");
|
||||||
|
|
||||||
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
|
||||||
|
@ -1445,7 +1467,15 @@ static bool llama_eval_internal(
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
|
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
|
||||||
|
|
||||||
|
#ifdef LLAMA_USE_ALLOCATOR
|
||||||
|
ggml_allocr_alloc(lctx.alloc, inpL);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
||||||
|
}
|
||||||
|
#else
|
||||||
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
const int i_gpu_start = n_layer - n_gpu_layers;
|
const int i_gpu_start = n_layer - n_gpu_layers;
|
||||||
|
@ -1472,6 +1502,17 @@ static bool llama_eval_internal(
|
||||||
}
|
}
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
|
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
|
||||||
|
#ifdef LLAMA_USE_ALLOCATOR
|
||||||
|
ggml_allocr_alloc(lctx.alloc, KQ_scale);
|
||||||
|
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
||||||
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
|
||||||
|
#endif
|
||||||
|
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
||||||
|
|
||||||
for (int il = 0; il < n_layer; ++il) {
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
ggml_format_name(inpL, "layer_inp_%d", il);
|
ggml_format_name(inpL, "layer_inp_%d", il);
|
||||||
|
|
||||||
|
@ -1567,9 +1608,6 @@ static bool llama_eval_internal(
|
||||||
ggml_set_name(KQ, "KQ");
|
ggml_set_name(KQ, "KQ");
|
||||||
|
|
||||||
// KQ_scaled = KQ / sqrt(n_embd_head)
|
// KQ_scaled = KQ / sqrt(n_embd_head)
|
||||||
struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
|
|
||||||
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
|
|
||||||
|
|
||||||
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
// KQ_scaled shape [n_past + N, N, n_head, 1]
|
||||||
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
|
||||||
offload_func_kq(KQ_scaled);
|
offload_func_kq(KQ_scaled);
|
||||||
|
@ -1685,9 +1723,6 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
lctx.use_buf(ctx0, 0);
|
lctx.use_buf(ctx0, 0);
|
||||||
|
|
||||||
// used at the end to optionally extract the embeddings
|
|
||||||
struct ggml_tensor * embeddings = NULL;
|
|
||||||
|
|
||||||
// norm
|
// norm
|
||||||
{
|
{
|
||||||
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps);
|
||||||
|
@ -1698,8 +1733,6 @@ static bool llama_eval_internal(
|
||||||
cur = ggml_mul(ctx0, cur, model.norm);
|
cur = ggml_mul(ctx0, cur, model.norm);
|
||||||
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
|
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
|
||||||
ggml_set_name(cur, "result_norm");
|
ggml_set_name(cur, "result_norm");
|
||||||
|
|
||||||
embeddings = cur;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// lm_head
|
// lm_head
|
||||||
|
@ -1711,12 +1744,88 @@ static bool llama_eval_internal(
|
||||||
// logits -> probs
|
// logits -> probs
|
||||||
//cur = ggml_soft_max_inplace(ctx0, cur);
|
//cur = ggml_soft_max_inplace(ctx0, cur);
|
||||||
|
|
||||||
// run the computation
|
|
||||||
ggml_build_forward_expand(gf, cur);
|
ggml_build_forward_expand(gf, cur);
|
||||||
|
|
||||||
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
|
if (mem_per_token == 0) {
|
||||||
|
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
||||||
|
ggml_used_mem(ctx0)/1024.0/1024.0,
|
||||||
|
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
||||||
|
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
||||||
|
lctx.work_buffer.size()/1024.0/1024.0,
|
||||||
|
n_past, N);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ggml_free(ctx0);
|
||||||
|
|
||||||
|
return gf;
|
||||||
|
}
|
||||||
|
|
||||||
|
// evaluate the transformer
|
||||||
|
//
|
||||||
|
// - lctx: llama context
|
||||||
|
// - tokens: new batch of tokens to process
|
||||||
|
// - embd embeddings input
|
||||||
|
// - n_tokens number of tokens
|
||||||
|
// - n_past: the context size so far
|
||||||
|
// - n_threads: number of threads to use
|
||||||
|
//
|
||||||
|
static bool llama_eval_internal(
|
||||||
|
llama_context & lctx,
|
||||||
|
const llama_token * tokens,
|
||||||
|
const float * embd,
|
||||||
|
int n_tokens,
|
||||||
|
int n_past,
|
||||||
|
int n_threads,
|
||||||
|
const char * cgraph_fname) {
|
||||||
|
|
||||||
|
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
||||||
|
|
||||||
|
const int64_t t_start_us = ggml_time_us();
|
||||||
|
|
||||||
|
#ifdef GGML_USE_MPI
|
||||||
|
ggml_mpi_eval_init(lctx.ctx_mpi, &n_tokens, &n_past, &n_threads);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
const int N = n_tokens;
|
||||||
|
|
||||||
|
const auto & model = lctx.model;
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const auto & kv_self = lctx.kv_self;
|
||||||
|
|
||||||
|
LLAMA_ASSERT(!!kv_self.ctx);
|
||||||
|
|
||||||
|
const int64_t n_embd = hparams.n_embd;
|
||||||
|
const int64_t n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
|
#ifdef LLAMA_USE_ALLOCATOR
|
||||||
|
ggml_allocr_reset(lctx.alloc);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
|
||||||
|
|
||||||
|
#ifdef LLAMA_USE_ALLOCATOR
|
||||||
|
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
||||||
|
|
||||||
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||||
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||||
|
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
||||||
|
|
||||||
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
||||||
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
||||||
|
|
||||||
|
LLAMA_ASSERT(strcmp(res->name, "result_output") == 0);
|
||||||
|
LLAMA_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
||||||
|
|
||||||
#if GGML_USE_MPI
|
#if GGML_USE_MPI
|
||||||
|
const int64_t n_layer = hparams.n_layer;
|
||||||
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -1728,7 +1837,10 @@ static bool llama_eval_internal(
|
||||||
//}
|
//}
|
||||||
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
||||||
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
ggml_metal_graph_compute(lctx.ctx_metal, gf);
|
||||||
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
ggml_metal_get_tensor (lctx.ctx_metal, res);
|
||||||
|
if (!lctx.embedding.empty()) {
|
||||||
|
ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// IMPORTANT:
|
// IMPORTANT:
|
||||||
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
|
// Since we don't have efficient Matrix x Matrix Metal multiplication yet, we fallback to vanilla
|
||||||
|
@ -1759,8 +1871,6 @@ static bool llama_eval_internal(
|
||||||
// update kv token count
|
// update kv token count
|
||||||
lctx.kv_self.n = n_past + N;
|
lctx.kv_self.n = n_past + N;
|
||||||
|
|
||||||
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
|
||||||
|
|
||||||
if (cgraph_fname) {
|
if (cgraph_fname) {
|
||||||
ggml_graph_export(gf, cgraph_fname);
|
ggml_graph_export(gf, cgraph_fname);
|
||||||
}
|
}
|
||||||
|
@ -1798,21 +1908,6 @@ static bool llama_eval_internal(
|
||||||
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
|
memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (mem_per_token == 0) {
|
|
||||||
mem_per_token = ggml_used_mem(ctx0)/N;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if 0
|
|
||||||
printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__,
|
|
||||||
ggml_used_mem(ctx0)/1024.0/1024.0,
|
|
||||||
lctx.get_buf_max_mem(0)/1024.0/1024.0,
|
|
||||||
lctx.get_buf_max_mem(1)/1024.0/1024.0,
|
|
||||||
lctx.work_buffer.size()/1024.0/1024.0,
|
|
||||||
n_past, N);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ggml_free(ctx0);
|
|
||||||
|
|
||||||
// measure the performance only for the single-token evals
|
// measure the performance only for the single-token evals
|
||||||
if (N == 1) {
|
if (N == 1) {
|
||||||
lctx.t_eval_us += ggml_time_us() - t_start_us;
|
lctx.t_eval_us += ggml_time_us() - t_start_us;
|
||||||
|
@ -3103,7 +3198,7 @@ struct llama_model * llama_load_model_from_file(
|
||||||
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||||
|
|
||||||
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
|
if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.rms_norm_eps, params.n_gpu_layers,
|
||||||
params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale,params.low_vram,
|
||||||
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
|
memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback,
|
||||||
params.progress_callback_user_data)) {
|
params.progress_callback_user_data)) {
|
||||||
delete model;
|
delete model;
|
||||||
|
@ -3180,10 +3275,47 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ctx->embedding.resize(hparams.n_embd);
|
ctx->embedding.resize(hparams.n_embd);
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
#ifdef LLAMA_USE_ALLOCATOR
|
||||||
|
{
|
||||||
|
static const size_t tensor_alignment = 32;
|
||||||
|
// the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
|
||||||
|
ctx->buf_compute.resize(ggml_tensor_overhead()*GGML_MAX_NODES + ggml_graph_overhead());
|
||||||
|
|
||||||
|
// create measure allocator
|
||||||
|
ctx->alloc = ggml_allocr_new_measure(tensor_alignment);
|
||||||
|
|
||||||
|
// build worst-case graph
|
||||||
|
int n_tokens = std::min((int)hparams.n_ctx, params.n_batch);
|
||||||
|
int n_past = hparams.n_ctx - n_tokens;
|
||||||
|
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
||||||
|
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
|
||||||
|
|
||||||
|
// measure memory requirements for the graph
|
||||||
|
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
|
||||||
|
|
||||||
|
fprintf(stderr, "%s: compute buffer total size = %7.2f MB\n", __func__, (ctx->buf_compute.size + alloc_size) / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
// debug - for comparison with scratch buffer
|
||||||
|
//size_t prev_req =
|
||||||
|
// MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type) +
|
||||||
|
// MEM_REQ_SCRATCH1().at(ctx->model.type) +
|
||||||
|
// MEM_REQ_EVAL().at(ctx->model.type);
|
||||||
|
//fprintf(stderr, "%s: (debug) equivalent with scratch buffer = %7.2f MB\n", __func__, prev_req / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
// recreate allocator with exact memory requirements
|
||||||
|
ggml_allocr_free(ctx->alloc);
|
||||||
|
|
||||||
|
ctx->buf_alloc.resize(alloc_size);
|
||||||
|
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef LLAMA_USE_SCRATCH
|
||||||
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
|
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
|
||||||
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
|
@ -3253,9 +3385,6 @@ struct llama_context * llama_init_from_file(
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_free(struct llama_context * ctx) {
|
void llama_free(struct llama_context * ctx) {
|
||||||
if (ctx->model_owner) {
|
|
||||||
delete &ctx->model;
|
|
||||||
}
|
|
||||||
delete ctx;
|
delete ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
1
llama.h
1
llama.h
|
@ -108,6 +108,7 @@ extern "C" {
|
||||||
|
|
||||||
// Keep the booleans together to avoid misalignment during copy-by-value.
|
// Keep the booleans together to avoid misalignment during copy-by-value.
|
||||||
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
bool low_vram; // if true, reduce VRAM usage at the cost of performance
|
||||||
|
bool mul_mat_q; // if true, use experimental mul_mat_q kernels
|
||||||
bool f16_kv; // use fp16 for KV cache
|
bool f16_kv; // use fp16 for KV cache
|
||||||
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
bool logits_all; // the llama_eval() call computes all logits, not just the last one
|
||||||
bool vocab_only; // only load the vocabulary, no weights
|
bool vocab_only; // only load the vocabulary, no weights
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue