rebased and trimmed down
now compiling again

parent 8e672efe63
commit ee9b0bceeb
15 changed files with 2302 additions and 823 deletions
@@ -1,7 +1,33 @@
 cmake_minimum_required(VERSION 3.13) # for add_link_options
 project("llama.cpp" C CXX)
 
+if (NOT MSVC)
+set(cuda_flags -Wno-pedantic)
+endif()
+
+set(LLAMA_CUBLAS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+set(LLAMA_CUDA_F16 ON)
+set(LLAMA_ACCELERATE ON)
+set(LLAMA_K_QUANTS ON)
+
+#-DLLAMA_NATIVE=off
+set(LLAMA_AVX ON)
+set(LLAMA_AVX2 OFF)
+set(LLAMA_AVX512 OFF)
+set(LLAMA_FMA OFF)
+set(LLAMA_F16C OFF)
+set(CMAKE_CUDA_FLAGS "--verbose") #
+set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+set(CUDACXX /usr/local/cuda-12.3/bin/nvcc)
+set(CMAKE_CUDA_COMPILER /usr/local/cuda-12.3/bin/nvcc)
+set(CUDA_TOOLKIT_ROOT_DIR /usr/local/cuda-12.3)
+#GGML_USE_CUBLAS
+
+#set(CMAKE_EXE_LINKER_FLAGS -pg)
+#set(CMAKE_SHARED_LINKER_FLAGS -pg)
+
+set(CMAKE_BUILD_TYPE Debug CACHE STRING "Build type" FORCE)
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
 set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
@@ -44,7 +70,7 @@ endif()
 
 # general
 option(LLAMA_STATIC "llama: static link libraries" OFF)
-option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
+option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
 option(LLAMA_LTO "llama: enable link time optimization" OFF)
 
 # debug
@@ -77,9 +103,9 @@ endif()
 
 # 3rd party libs
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
-option(LLAMA_BLAS "llama: use BLAS" OFF)
+option(LLAMA_BLAS "llama: use BLAS" ON)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
-option(LLAMA_CUBLAS "llama: use CUDA" OFF)
+option(LLAMA_CUBLAS "llama: use CUDA" ON)
 #option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
 option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
@@ -104,7 +130,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example"
 # Compile flags
 #
 
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
@@ -230,7 +256,12 @@ if (LLAMA_BLAS)
 
 message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
 add_compile_options(${BLAS_LINKER_FLAGS})
-add_compile_definitions(GGML_USE_OPENBLAS)
+# from https://github.com/NVIDIA/cutlass
+make_directory("${PROJECT_BINARY_DIR}/nvcc_tmp")
+set(cuda_flags --keep "SHELL:--keep-dir ${PROJECT_BINARY_DIR}/nvcc_tmp" ${cuda_flags})
+
+# add_compile_definitions(GGML_USE_OPENBLAS)
 if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
 add_compile_definitions(GGML_BLAS_USE_MKL)
 endif()
@@ -272,6 +303,7 @@ if (LLAMA_CUBLAS)
 endif()
 add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
 add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+
 if (DEFINED LLAMA_CUDA_DMMV_Y)
 add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
 endif()
@@ -312,7 +344,7 @@ if (LLAMA_MPI)
 if (MPI_C_FOUND)
 message(STATUS "MPI found")
 set(GGML_HEADERS_MPI ggml-mpi.h)
-set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
+set(GGML_SOURCES_MPI ggml-mpi.cpp ggml-mpi.h)
 add_compile_definitions(GGML_USE_MPI)
 add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
 if (NOT MSVC)
@@ -390,14 +422,15 @@ endif()
 
 if (LLAMA_ALL_WARNINGS)
 if (NOT MSVC)
-set(warning_flags -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)
+# -Wpedantic
+set(warning_flags -Wall -Wextra -Wcast-qual -Wno-unused-function)
 set(c_flags -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration)
-set(cxx_flags -Wmissing-declarations -Wmissing-noreturn)
+set(cxx_flags -Wmissing-declarations -Wmissing-noreturn -fpermissive)
 set(host_cxx_flags "")
 
 if (CMAKE_C_COMPILER_ID MATCHES "Clang")
 set(warning_flags ${warning_flags} -Wunreachable-code-break -Wunreachable-code-return)
-set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi)
+set(host_cxx_flags ${host_cxx_flags} -Wmissing-prototypes -Wextra-semi -fpermissive)
 
 if (
 (CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER_EQUAL 3.8.0) OR
@@ -407,30 +440,27 @@ if (LLAMA_ALL_WARNINGS)
 endif()
 elseif (CMAKE_C_COMPILER_ID STREQUAL "GNU")
 set(c_flags ${c_flags} -Wdouble-promotion)
-set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds)
+set(host_cxx_flags ${host_cxx_flags} -Wno-array-bounds -fpermissive)
 
 if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.1.0)
-set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation)
+set(host_cxx_flags ${host_cxx_flags} -Wno-format-truncation -fpermissive)
 endif()
 if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 8.1.0)
-set(host_cxx_flags ${host_cxx_flags} -Wextra-semi)
+set(host_cxx_flags ${host_cxx_flags} -Wextra-semi -fpermissive)
 endif()
 endif()
 else()
 # todo : msvc
 endif()
 
-set(c_flags ${c_flags} ${warning_flags})
-set(cxx_flags ${cxx_flags} ${warning_flags})
+set(c_flags ${c_flags} -save-temps --verbose ${warning_flags})
+set(cxx_flags ${cxx_flags} -fpermissive -save-temps --verbose ${warning_flags})
 add_compile_options("$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
 "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
 "$<$<COMPILE_LANGUAGE:CXX>:${host_cxx_flags}>")
 
 endif()
 
-if (NOT MSVC)
-set(cuda_flags -Wno-pedantic)
-endif()
 set(cuda_flags ${cxx_flags} -use_fast_math ${cuda_flags})
 
 list(JOIN host_cxx_flags " " cuda_host_flags) # pass host compiler flags as a single argument
@@ -438,6 +468,9 @@ if (NOT cuda_host_flags STREQUAL "")
 set(cuda_flags ${cuda_flags} -Xcompiler ${cuda_host_flags})
 endif()
 
+#
+set(cuda_flags --verbose -G ${cuda_flags})
+
 add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${cuda_flags}>")
 
 if (WIN32)
@@ -485,6 +518,8 @@ if (NOT MSVC)
 add_link_options(-static-libgcc -static-libstdc++)
 endif()
 endif()
+add_link_options("-Wl,-Map=${TARGET}.map")
+
 if (LLAMA_GPROF)
 add_compile_options(-pg)
 endif()
@@ -645,13 +680,13 @@ if (GGML_USE_CPU_HBM)
 endif()
 
 add_library(ggml OBJECT
-ggml.c
+ggml.cpp
 ggml.h
-ggml-alloc.c
+ggml-alloc.cpp
 ggml-alloc.h
-ggml-backend.c
+ggml-backend.cpp
 ggml-backend.h
-ggml-quants.c
+ggml-quants.cpp
 ggml-quants.h
 ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
 ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
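Both build files now require C++17 (`CMAKE_CXX_STANDARD 17` above, `-std=c++17` in the Makefile below). As a quick sanity check, a translation unit can assert at compile time that it really is being built with that standard; a small sketch of mine, not part of the commit:

```cpp
// Illustration only: fails to compile if the file is built with a pre-C++17 standard.
// (MSVC reports the full __cplusplus value only when /Zc:__cplusplus is passed,
// so this check is mainly meaningful for GCC/Clang builds.)
static_assert(__cplusplus >= 201703L, "this file expects -std=c++17 or newer");

int main() { return 0; }
```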
Makefile (20 changes)

@@ -116,7 +116,7 @@ endif
 # keep standard at C11 and C++11
 MK_CPPFLAGS = -I. -Icommon
 MK_CFLAGS = -std=c11 -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CXXFLAGS = -std=c++17 -fPIC -fpermissive
 
 # -Ofast tends to produce faster code, but may not be available for some compilers.
 ifdef LLAMA_FAST
@@ -502,7 +502,7 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 endif # LLAMA_METAL
 
 ifdef LLAMA_MPI
-ggml-mpi.o: ggml-mpi.c ggml-mpi.h
+ggml-mpi.o: ggml-mpi.cpp ggml-mpi.h
 $(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
@@ -537,17 +537,17 @@ $(info )
 # Build library
 #
 
-ggml.o: ggml.c ggml.h ggml-cuda.h
-$(CC) $(CFLAGS) -c $< -o $@
+ggml.o: ggml.cpp ggml.h ggml-cuda.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
 
-ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
-$(CC) $(CFLAGS) -c $< -o $@
+ggml-alloc.o: ggml-alloc.cpp ggml.h ggml-alloc.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
 
-ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
-$(CC) $(CFLAGS) -c $< -o $@
+ggml-backend.o: ggml-backend.cpp ggml.h ggml-backend.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
 
-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
-$(CC) $(CFLAGS) -c $< -o $@
+ggml-quants.o: ggml-quants.cpp ggml.h ggml-quants.h
+$(CXX) $(CXXFLAGS) -c $< -o $@
 
 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
 
@@ -696,7 +696,7 @@ PROMPT_TEMPLATE=./prompts/chat-with-bob.txt PROMPT_CACHE_FILE=bob.prompt.bin \
 
 The `grammars/` folder contains a handful of sample grammars. To write your own, check out the [GBNF Guide](./grammars/README.md).
 
-For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
+For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets ygou write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
 
 ### Instruction mode with Alpaca
 

README.org (new file, 1039 lines; diff suppressed because it is too large)
@@ -659,7 +659,7 @@ int main(int argc, char ** argv) {
 if (input_echo) {
 for (auto id : embd) {
 const std::string token_str = llama_token_to_piece(ctx, id);
-printf("%s", token_str.c_str());
+printf("TOKEN:%s\n", token_str.c_str());
 
 if (embd.size() > 1) {
 input_tokens.push_back(id);
@@ -850,6 +850,9 @@ int main(int argc, char ** argv) {
 llama_print_timings(ctx);
 write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
+// dont dump core
+//int *ptr = 0; *ptr = 1;
+
 if (ctx_guidance) { llama_free(ctx_guidance); }
 llama_free(ctx);
 llama_free_model(model);
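With the `printf("TOKEN:%s\n", ...)` change above, every echoed token is written on its own line with a `TOKEN:` prefix instead of being streamed as raw text. A minimal sketch of a downstream filter that recovers the plain text (my own illustration, not part of the commit; it assumes each token piece fits on one line):

```cpp
// Hypothetical consumer of the "TOKEN:<piece>" lines emitted by the patched main example.
#include <iostream>
#include <string>

int main() {
    const std::string prefix = "TOKEN:";
    std::string line;
    while (std::getline(std::cin, line)) {
        // keep only lines produced by the new printf and strip the prefix
        if (line.compare(0, prefix.size(), prefix) == 0) {
            std::cout << line.substr(prefix.size());
        }
    }
    std::cout << '\n';
    return 0;
}
```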
ggml-cuda.cu (16 changes)

@@ -7623,12 +7623,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #endif
 
 // debug helpers
-//printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-//printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-//printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-//printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+// printf("JSON: { \"data\":{ \"src0\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"src1\": { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}, \"dst\" : { \"%s\" :{ \"ne\" : [ %8d, %8d, %8d, %8d ], \"nb\" : [ %8d, %8d, %8d, %8d ], \"contiguous\":\"%d\", \"transposed\":\"%d\", \"type\": \"%s\", \"name\" : \"%s\"}}}}\n",
+// src0->name, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+// ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name,
+// src1->name, src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name,
+// dst->name, dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], ggml_is_contiguous(dst), ggml_is_transposed(dst), ggml_type_name(dst->type), dst->name
+// );
 
 if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
 // KQ single-batch
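The rewritten debug comment above would, if uncommented, print one JSON record per mul_mat call describing src0, src1 and dst (shape, strides, contiguity, type, name). A self-contained sketch of the same idea, using a hypothetical `TensorInfo` struct and made-up values in place of the real `ggml_tensor` arguments:

```cpp
// Standalone illustration only: TensorInfo and the sample values are invented;
// in ggml-cuda.cu the fields would come from the ggml_tensor arguments.
#include <cstdio>

struct TensorInfo {
    const char * name;
    long ne[4];   // dimensions
    long nb[4];   // strides in bytes
};

static void print_tensor_json(const char * key, const TensorInfo & t) {
    std::printf("\"%s\": {\"name\": \"%s\", \"ne\": [%ld, %ld, %ld, %ld], \"nb\": [%ld, %ld, %ld, %ld]}",
                key, t.name, t.ne[0], t.ne[1], t.ne[2], t.ne[3],
                t.nb[0], t.nb[1], t.nb[2], t.nb[3]);
}

int main() {
    TensorInfo src0 = {"src0", {8, 8, 1, 1}, {4, 32, 256, 256}};
    TensorInfo src1 = {"src1", {8, 1, 1, 1}, {4, 32, 32, 32}};
    TensorInfo dst  = {"dst",  {8, 1, 1, 1}, {4, 32, 32, 32}};
    std::printf("{\"data\": {");
    print_tensor_json("src0", src0); std::printf(", ");
    print_tensor_json("src1", src1); std::printf(", ");
    print_tensor_json("dst", dst);
    std::printf("}}\n");
    return 0;
}
```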
@@ -8056,9 +8056,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 
 if (tensor->op == GGML_OP_MUL_MAT) {
 if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
-#ifndef NDEBUG
 fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %d, src1->ne[3] = %d - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
-#endif
 return false;
 }
 }
@@ -22,7 +22,7 @@ extern "C" {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #else
-#define static_assert(cond, msg) struct global_scope_noop_trick
+//#define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 #endif
 
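Context for the change above: once the ggml sources are compiled as C++ (the .c to .cpp renames elsewhere in this commit), `static_assert` is a language keyword, so the C fallback macro must not be defined. A small sketch of the resulting situation, my own example rather than part of the patch:

```cpp
// Guaranteed-true assertion, used only to show the two spellings.
#if defined(__cplusplus)
// C++11 and later: static_assert is a keyword, no macro is needed.
static_assert(sizeof(char) == 1, "example: keyword form under C++");
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
// C11: the keyword is spelled _Static_assert; the #define above maps the C++ name to it.
_Static_assert(sizeof(char) == 1, "example: C11 spelling");
#endif
```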
(another file's diff suppressed because it is too large)
@@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_
 
 
 // Quantization
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
+void quantize_row_q4_0_reference(const float * __restrict__ x, block_q4_0 * __restrict__ y, int k);
+void quantize_row_q4_1_reference(const float * __restrict__ x, block_q4_1 * __restrict__ y, int k);
+void quantize_row_q5_0_reference(const float * __restrict__ x, block_q5_0 * __restrict__ y, int k);
+void quantize_row_q5_1_reference(const float * __restrict__ x, block_q5_1 * __restrict__ y, int k);
+void quantize_row_q8_0_reference(const float * __restrict__ x, block_q8_0 * __restrict__ y, int k);
+void quantize_row_q8_1_reference(const float * __restrict__ x, block_q8_1 * __restrict__ y, int k);
 
-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+void quantize_row_q2_K_reference(const float * __restrict__ x, block_q2_K * __restrict__ y, int k);
+void quantize_row_q3_K_reference(const float * __restrict__ x, block_q3_K * __restrict__ y, int k);
+void quantize_row_q4_K_reference(const float * __restrict__ x, block_q4_K * __restrict__ y, int k);
+void quantize_row_q5_K_reference(const float * __restrict__ x, block_q5_K * __restrict__ y, int k);
+void quantize_row_q6_K_reference(const float * __restrict__ x, block_q6_K * __restrict__ y, int k);
+void quantize_row_q8_K_reference(const float * __restrict__ x, block_q8_K * __restrict__ y, int k);
 
-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_1(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_0(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_1(const float * __restrict__ x, void * __restrict__ y, int k);
 
-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q2_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q3_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q4_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q5_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q6_K(const float * __restrict__ x, void * __restrict__ y, int k);
+void quantize_row_q8_K(const float * __restrict__ x, void * __restrict__ y, int k);
 
 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q4_0(const block_q4_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_1(const block_q4_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_0(const block_q5_0 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_1(const block_q5_1 * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_0(const block_q8_0 * __restrict__ x, float * __restrict__ y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * __restrict__ x, float * __restrict__ y, int k);
 
-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_q2_K(const block_q2_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q3_K(const block_q3_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q4_K(const block_q4_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q5_K(const block_q5_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q6_K(const block_q6_K * __restrict__ x, float * __restrict__ y, int k);
+void dequantize_row_q8_K(const block_q8_K * __restrict__ x, float * __restrict__ y, int k);
 
 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
 
-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q2_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * __restrict__ s, const void * __restrict__ vx, const void * __restrict__ vy);
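These declarations switch from the C99 keyword `restrict` to the GNU/Clang extension `__restrict__` so that the headers also parse when the sources are built as C++. A common alternative, shown here only as a sketch (`GGML_RESTRICT` is a name I am introducing for illustration, not something this commit defines), is a small portability macro:

```cpp
// Hypothetical portability shim: keep `restrict` for C, map it to a compiler
// extension for C++ (MSVC spells it __restrict). Not part of this commit.
#if defined(__cplusplus)
#  if defined(_MSC_VER)
#    define GGML_RESTRICT __restrict
#  else
#    define GGML_RESTRICT __restrict__
#  endif
#else
#  define GGML_RESTRICT restrict
#endif

// example declaration written against the macro instead of __restrict__ directly
void quantize_row_example(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
```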
ggml.c → ggml.cpp (1227 changes; diff suppressed because it is too large)

ggml.h (6 changes)
@@ -571,6 +571,12 @@ extern "C" {
 size_t offs;
 size_t size;
 void * data;
+
+ggml_scratch()
+: offs(0),
+size(0),
+data(0)
+{}
 };
 
 struct ggml_init_params {
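The added constructor means a default-constructed `ggml_scratch` starts with zeroed members once the struct is used from C++ code. A minimal stand-alone illustration (the struct below is a stand-in of mine, not the real header):

```cpp
#include <cstddef>

// Stand-in for the patched ggml_scratch, to show the effect of the added constructor.
struct scratch_example {
    size_t offs;
    size_t size;
    void * data;

    scratch_example() : offs(0), size(0), data(nullptr) {}
};

int main() {
    scratch_example s;  // members are 0 / nullptr even without an explicit initializer
    return (s.offs == 0 && s.size == 0 && s.data == nullptr) ? 0 : 1;
}
```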
llama.cpp (30 changes)
@@ -1494,6 +1494,7 @@ static bool llama_kv_cache_init(
 ggml_type wtype,
 uint32_t n_ctx,
 int n_gpu_layers) {
+fprintf(stderr, "GPULAYERS '%d'\n", n_gpu_layers);
 const uint32_t n_embd = hparams.n_embd_gqa();
 const uint32_t n_layer = hparams.n_layer;
 
@@ -1531,6 +1532,7 @@ static bool llama_kv_cache_init(
 (void) n_gpu_layers;
 
 #ifdef GGML_USE_CUBLAS
+fprintf(stderr, "USE CUBLAS\n");
 if (ggml_cublas_loaded()) {
 size_t vram_kv_cache = 0;
 
@@ -1548,6 +1550,8 @@ static bool llama_kv_cache_init(
 LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
 }
 }
+#else
+fprintf(stderr, "NO USE CUBLAS\n");
 #endif
 
 return true;
@@ -2065,6 +2069,7 @@ struct llama_model_loader {
 break;
 #ifdef GGML_USE_CUBLAS
 case GGML_BACKEND_GPU:
+
 case GGML_BACKEND_GPU_SPLIT:
 // old code:
 //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
@@ -2741,9 +2746,11 @@ static void llm_load_tensors(
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
 if (backend_norm == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights00 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm);
 }
 if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+fprintf(stderr, "vram_weights01 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output);
 }
 }
@@ -2774,6 +2781,7 @@ static void llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
 if (backend == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights03 '%ld'\n", vram_weights);
 vram_weights +=
 ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
 ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
@@ -2807,9 +2815,11 @@ static void llm_load_tensors(
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
 if (backend_norm == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights04 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm);
 }
 if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+fprintf(stderr, "vram_weights05 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output);
 }
 }
@@ -2840,6 +2850,7 @@ static void llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
 if (backend == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights06 '%ld'\n", vram_weights);
 vram_weights +=
 ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
 ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
@@ -2878,10 +2889,13 @@ static void llm_load_tensors(
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
 if (backend_norm == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights07 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm);
+fprintf(stderr, "vram_weights08 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm_b);
 }
 if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+fprintf(stderr, "vram_weights09 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output);
 }
 }
@@ -2906,7 +2920,9 @@ static void llm_load_tensors(
 layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
 
 if (backend == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights10 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(layer.attn_norm_2);
+fprintf(stderr, "vram_weights11 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(layer.attn_norm_2_b);
 }
 }
@@ -2918,6 +2934,7 @@ static void llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
 if (backend == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights12 '%ld'\n", vram_weights);
 vram_weights +=
 ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
 ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
@@ -2955,10 +2972,12 @@ static void llm_load_tensors(
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
 if (backend_norm == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights13 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm);
 vram_weights += ggml_nbytes(model.output_norm_b);
 }
 if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+fprintf(stderr, "vram_weights14 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output);
 }
 }
@@ -2994,6 +3013,7 @@ static void llm_load_tensors(
 layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
 
 if (backend == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights15 '%ld'\n", vram_weights);
 vram_weights +=
 ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
 ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
@@ -3039,10 +3059,13 @@ static void llm_load_tensors(
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
 if (backend_norm == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights16 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm);
+fprintf(stderr, "vram_weights17 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm_b);
 }
 if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+fprintf(stderr, "vram_weights18 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output);
 }
 }
@@ -3105,10 +3128,13 @@ static void llm_load_tensors(
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
 if (backend_norm == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights19 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm);
+fprintf(stderr, "vram_weights20 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm_b);
 }
 if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+fprintf(stderr, "vram_weights21 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output);
 }
 }
@@ -3144,6 +3170,7 @@ static void llm_load_tensors(
 layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
 
 if (backend == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights22 '%ld'\n", vram_weights);
 vram_weights +=
 ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
 ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
@@ -3182,9 +3209,11 @@ static void llm_load_tensors(
 model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
 if (backend_norm == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights23 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output_norm);
 }
 if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+fprintf(stderr, "vram_weights24 '%ld'\n", vram_weights);
 vram_weights += ggml_nbytes(model.output);
 }
 }
@@ -3211,6 +3240,7 @@ static void llm_load_tensors(
 layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
 if (backend == GGML_BACKEND_GPU) {
+fprintf(stderr, "vram_weights25 '%ld'\n", vram_weights);
 vram_weights +=
 ggml_nbytes(layer.attn_norm) +
 ggml_nbytes(layer.wqkv) +
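The llama.cpp hunks above sprinkle numbered `fprintf(stderr, "vram_weightsNN ...")` probes through `llm_load_tensors` to trace how the running VRAM total grows. If those probes were kept, one way to avoid repeating the format string at every site is a small helper macro; this is a sketch of mine only, not something the commit adds:

```cpp
#include <cstdio>

// Hypothetical helper: print the running total with a site tag, mirroring the
// "vram_weightsNN '<bytes>'" lines added in the patch.
#define LOG_VRAM_WEIGHTS(tag, total) \
    std::fprintf(stderr, "vram_weights%s '%ld'\n", (tag), (long)(total))

int main() {
    long vram_weights = 0;
    LOG_VRAM_WEIGHTS("00", vram_weights);   // before adding a tensor's size
    vram_weights += 4096;                   // stand-in for ggml_nbytes(...)
    LOG_VRAM_WEIGHTS("01", vram_weights);
    return 0;
}
```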