Merge branch 'master' into minimalist_example

commit 6804100fe2
Georgi Gerganov, 2023-06-16 21:57:40 +03:00, committed by GitHub
36 changed files with 7369 additions and 1081 deletions

3
.gitignore vendored

@@ -22,6 +22,7 @@ build-metal/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
+out/
 models/*
 *.bin
@@ -32,6 +33,7 @@ models/*
 /result
 /perplexity
 /embedding
+/train-text-from-scratch
 /benchmark-matmult
 /vdot
 /Pipfile
@@ -40,6 +42,7 @@ models/*
 build-info.h
 arm_neon.h
 compile_commands.json
+CMakeSettings.json
 __pycache__

CMakeLists.txt

@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF) option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels") set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON) option(LLAMA_K_QUANTS "llama: use k-quants" ON)
@ -158,17 +159,64 @@ if (LLAMA_BLAS)
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22) if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8) set(BLA_SIZEOF_INTEGER 8)
endif() endif()
set(BLA_VENDOR ${LLAMA_BLAS_VENDOR}) set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
find_package(BLAS) find_package(BLAS)
if (BLAS_FOUND) if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
pkg_check_modules(DepBLAS REQUIRED openblas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS REQUIRED blas-atlas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config
# suggest to assign BLAS_INCLUDE_DIRS on your own
if ("${NVHPC_VERSION}" STREQUAL "")
message(WARNING "Better to set NVHPC_VERSION")
else()
set(DepBLAS_FOUND ON)
set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
endif()
endif()
if (DepBLAS_FOUND)
set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
" detected by pkgconfig, trying to find cblas.h from possible paths...")
find_path(BLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS
/usr/include
/usr/local/include
/usr/include/openblas
/opt/homebrew/opt/openblas/include
/usr/local/opt/openblas/include
/usr/include/x86_64-linux-gnu/openblas/include
)
endif()
endif()
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS}) add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_OPENBLAS) add_compile_definitions(GGML_USE_OPENBLAS)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES}) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
message("${BLAS_LIBRARIES} ${BLAS_INCLUDE_DIRS}")
include_directories(${BLAS_INCLUDE_DIRS})
else() else()
message(WARNING "BLAS not found, please refer to " message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@ -190,6 +238,7 @@ if (LLAMA_CUBLAS)
add_compile_definitions(GGML_USE_CUBLAS) add_compile_definitions(GGML_USE_CUBLAS)
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y}) add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
if (LLAMA_STATIC) if (LLAMA_STATIC)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@ -408,7 +457,7 @@ add_library(ggml OBJECT
${GGML_SOURCES_EXTRA} ${GGML_SOURCES_EXTRA}
) )
target_include_directories(ggml PUBLIC .) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
target_compile_features(ggml PUBLIC c_std_11) # don't bump target_compile_features(ggml PUBLIC c_std_11) # don't bump
target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS}) target_link_libraries(ggml PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})

Makefile

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot simple
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple
 ifdef LLAMA_BUILD_SERVER
 BUILD_TARGETS += server
@@ -171,6 +171,11 @@ ifdef LLAMA_CUDA_DMMV_Y
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
 endif # LLAMA_CUDA_DMMV_Y
+ifdef LLAMA_CUDA_KQUANTS_ITER
+	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
+else
+	NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
+endif
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif # LLAMA_CUBLAS
@@ -259,7 +264,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot build-info.h
+	rm -vf *.o *.so main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server vdot train-text-from-scratch build-info.h
 #
 # Examples
@@ -295,6 +300,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.
 server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
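The equivalent Makefile-based build, using the new LLAMA_CUDA_KQUANTS_ITER variable and the new train-text-from-scratch target, might look like this sketch (not part of the diff):

```bash
# cuBLAS build with one k-quant iteration per thread instead of the default 2
make clean
LLAMA_CUBLAS=1 LLAMA_CUDA_KQUANTS_ITER=1 make main train-text-from-scratch

# the server target is still gated behind LLAMA_BUILD_SERVER
LLAMA_BUILD_SERVER=1 make server
```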

Package.swift

@@ -11,6 +11,7 @@ let package = Package(
     .target(
         name: "llama",
         path: ".",
+        exclude: ["ggml-metal.metal"],
         sources: ["ggml.c", "llama.cpp"],
         publicHeadersPath: "spm-headers",
         cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],

examples/CMakeLists.txt

@@ -37,6 +37,7 @@ else()
     add_subdirectory(save-load-state)
     add_subdirectory(benchmark)
     add_subdirectory(baby-llama)
+    add_subdirectory(train-text-from-scratch)
     if (LLAMA_METAL)
         add_subdirectory(metal)
     endif()

examples/baby-llama/baby-llama.cpp

@ -4,6 +4,10 @@
#include <random> #include <random>
#include <cstring> #include <cstring>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
float frand() { float frand() {
return (float)rand()/(float)RAND_MAX; return (float)rand()/(float)RAND_MAX;
} }
@ -79,34 +83,39 @@ struct ggml_tensor * randomize_tensor_normal(
int ndims, int ndims,
const int64_t ne[], const int64_t ne[],
struct random_normal_distribution * rnd) { struct random_normal_distribution * rnd) {
float scale = 1.0; // xavier
switch (ndims) { switch (ndims) {
case 1: case 1:
scale /= sqrtf(ne[0]);
for (int i0 = 0; i0 < ne[0]; i0++) { for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)tensor->data)[i0] = frand_normal(rnd); ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
} }
break; break;
case 2: case 2:
scale /= sqrtf(ne[0]+ne[1]);
for (int i1 = 0; i1 < ne[1]; i1++) { for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) { for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd); ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
} }
} }
break; break;
case 3: case 3:
scale /= sqrtf(ne[0]+ne[1]);
for (int i2 = 0; i2 < ne[2]; i2++) { for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) { for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) { for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
} }
} }
} }
break; break;
case 4: case 4:
scale /= sqrtf(ne[0]+ne[1]);
for (int i3 = 0; i3 < ne[3]; i3++) { for (int i3 = 0; i3 < ne[3]; i3++) {
for (int i2 = 0; i2 < ne[2]; i2++) { for (int i2 = 0; i2 < ne[2]; i2++) {
for (int i1 = 0; i1 < ne[1]; i1++) { for (int i1 = 0; i1 < ne[1]; i1++) {
for (int i0 = 0; i0 < ne[0]; i0++) { for (int i0 = 0; i0 < ne[0]; i0++) {
((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd); ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
} }
} }
} }
@ -148,8 +157,8 @@ struct llama_hparams_lora {
uint32_t n_rot = 64; uint32_t n_rot = 64;
uint32_t n_lora = 64; uint32_t n_lora = 64;
bool operator!=(const llama_hparams & other) const { bool operator!=(const llama_hparams_lora & other) const {
return memcmp(this, &other, sizeof(llama_hparams)); return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
} }
}; };
@ -1465,7 +1474,7 @@ struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_te
} }
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
const float eps = 1e-3; const float eps = 1e-3f;
return return
ggml_sum(ctx, ggml_sum(ctx,
ggml_neg(ctx, ggml_neg(ctx,

examples/benchmark/benchmark-matmult.cpp

@ -16,6 +16,10 @@
#include <iterator> #include <iterator>
#include <algorithm> #include <algorithm>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
float tensor_sum_elements(const ggml_tensor * tensor) { float tensor_sum_elements(const ggml_tensor * tensor) {
float sum = 0; float sum = 0;
if (tensor->type==GGML_TYPE_F32) { if (tensor->type==GGML_TYPE_F32) {
@ -29,9 +33,9 @@ float tensor_sum_elements(const ggml_tensor * tensor) {
} }
void tensor_dump(const ggml_tensor * tensor, const char * name) { void tensor_dump(const ggml_tensor * tensor, const char * name) {
printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", name, printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
tensor->type, ggml_type_name(tensor->type), tensor->type, ggml_type_name(tensor->type),
(int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]); tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
float sum = tensor_sum_elements(tensor); float sum = tensor_sum_elements(tensor);
printf("Sum of tensor %s is %6.2f\n", name, sum); printf("Sum of tensor %s is %6.2f\n", name, sum);
} }
@ -120,7 +124,7 @@ int main(int argc, char ** argv) {
ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
ctx_size += 1024*1024*16; ctx_size += 1024*1024*16;
printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024)); printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
struct ggml_init_params params = { struct ggml_init_params params = {
/*.mem_size =*/ ctx_size, /*.mem_size =*/ ctx_size,

41
examples/chat-vicuna.sh Executable file

@@ -0,0 +1,41 @@
#!/bin/bash
set -e
cd "$(dirname "$0")/.." || exit
MODEL="${MODEL:-./models/ggml-vic13b-uncensored-q5_0.bin}"
PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
USER_NAME="### Human"
AI_NAME="### Assistant"
# Adjust to the number of CPU cores you want to use.
N_THREAD="${N_THREAD:-8}"
# Number of tokens to predict (made it larger than default because we want a long interaction)
N_PREDICTS="${N_PREDICTS:-2048}"
# Note: you can also override the generation options by specifying them on the command line:
# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
DATE_TIME=$(date +%H:%M)
DATE_YEAR=$(date +%Y)
PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
-e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
-e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
-e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
$PROMPT_TEMPLATE > $PROMPT_FILE
# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
./bin/main $GEN_OPTIONS \
--model "$MODEL" \
--threads "$N_THREAD" \
--n_predict "$N_PREDICTS" \
--color --interactive \
--file ${PROMPT_FILE} \
--reverse-prompt "### Human:" \
--in-prefix ' ' \
"$@"

examples/common.cpp

@ -28,6 +28,10 @@
#include <wchar.h> #include <wchar.h>
#endif #endif
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
int32_t get_num_physical_cores() { int32_t get_num_physical_cores() {
#ifdef __linux__ #ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores // enumerate the set of thread siblings, num entries is num cores
@ -331,6 +335,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} }
#else #else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
#endif // GGML_USE_CUBLAS
} else if (arg == "--low-vram" || arg == "-lv") {
#ifdef GGML_USE_CUBLAS
params.low_vram = true;
#else
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") { } else if (arg == "--no-mmap") {
params.use_mmap = false; params.use_mmap = false;
@ -367,7 +377,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} else { } else {
throw std::exception(); throw std::exception();
} }
} catch (const std::exception &e) { } catch (const std::exception&) {
invalid_param = true; invalid_param = true;
break; break;
} }
@ -406,6 +416,14 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
gpt_print_usage(argc, argv, default_params); gpt_print_usage(argc, argv, default_params);
exit(1); exit(1);
} }
#ifdef GGML_USE_CUBLAS
if (!params.lora_adapter.empty() && params.n_gpu_layers > 0) {
fprintf(stderr, "%s: error: the simultaneous use of LoRAs and GPU acceleration is not supported", __func__);
exit(1);
}
#endif // GGML_USE_CUBLAS
if (escape_prompt) { if (escape_prompt) {
process_escapes(params.prompt); process_escapes(params.prompt);
} }
@ -479,6 +497,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n");
fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
#endif #endif
fprintf(stderr, " --mtest compute maximum memory usage\n"); fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
@ -528,6 +547,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
lparams.n_gpu_layers = params.n_gpu_layers; lparams.n_gpu_layers = params.n_gpu_layers;
lparams.main_gpu = params.main_gpu; lparams.main_gpu = params.main_gpu;
memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float)); memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
lparams.low_vram = params.low_vram;
lparams.seed = params.seed; lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16; lparams.f16_kv = params.memory_f16;
lparams.use_mmap = params.use_mmap; lparams.use_mmap = params.use_mmap;

examples/common.h

@@ -30,6 +30,7 @@ struct gpt_params {
     int32_t n_gpu_layers = 0; // number of layers to store in VRAM
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
+    bool low_vram = 0; // if true, reduce VRAM usage at the cost of performance
     // sampling parameters
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens

examples/embedding/embedding.cpp

@@ -4,6 +4,10 @@
 #include <ctime>
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
 int main(int argc, char ** argv) {
     gpt_params params;

examples/main/README.md

@@ -288,5 +288,6 @@ These options provide extra functionality and customization when running the LLa
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
 - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
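Combined, a run that offloads layers while skipping the VRAM scratch buffer might look like the sketch below; the model path, prompt, and layer count are placeholders rather than part of this change:

```bash
./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" \
  -ngl 32 -lv -ts 3,1 -mg 0
```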

examples/main/main.cpp

@@ -23,11 +23,17 @@
 #include <unistd.h>
 #elif defined (_WIN32)
 #define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
 #define NOMINMAX
+#endif
 #include <windows.h>
 #include <signal.h>
 #endif
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
 static console_state con_st;
 static llama_context ** g_ctx;
@@ -331,6 +337,13 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd;
+    // do one empty run to warm up the model
+    {
+        const std::vector<llama_token> tmp = { llama_token_bos(), };
+        llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
+        llama_reset_timings(ctx);
+    }
     while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
         // predict
         if (embd.size() > 0) {
@@ -341,7 +354,7 @@ int main(int argc, char ** argv) {
         if ((int)embd.size() > max_embd_size) {
             auto skipped_tokens = embd.size() - max_embd_size;
             console_set_color(con_st, CONSOLE_COLOR_ERROR);
-            printf("<<input too long: skipped %ld token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
+            printf("<<input too long: skipped %" PRIu64 " token%s>>", skipped_tokens, skipped_tokens != 1 ? "s" : "");
             console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
             fflush(stdout);
             embd.resize(max_embd_size);

examples/perplexity/perplexity.cpp

@@ -5,6 +5,10 @@
 #include <cmath>
 #include <ctime>
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
 std::vector<float> softmax(const std::vector<float>& logits) {
     std::vector<float> probs(logits.size());
     float max_logit = logits[0];

examples/quantize-stats/quantize-stats.cpp

@@ -19,6 +19,10 @@
 #include <thread>
 #include <mutex>
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
 struct quantize_stats_params {
     std::string model = "models/7B/ggml-model-f16.bin";
     bool verbose = false;

examples/save-load-state/save-load-state.cpp

@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
     // init
     auto ctx = llama_init_from_file(params.model.c_str(), lparams);
     auto tokens = std::vector<llama_token>(params.n_ctx);
-    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
+    auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), int(tokens.size()), true);
     if (n_prompt_tokens < 1) {
         fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);

examples/server/README.md

@@ -16,6 +16,10 @@ This example allow you to have a llama.cpp http server to interact from a web pa
 To get started right away, run the following command, making sure to use the correct path for the model you have:
 #### Unix-based systems (Linux, macOS, etc.):
+Make sure to build with the server option on
+```bash
+LLAMA_BUILD_SERVER=1 make
+```
 ```bash
 ./server -m models/7B/ggml-model.bin --ctx_size 2048
@@ -289,6 +293,7 @@ Test();
 - `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 - `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
 - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
+- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
 - `--embedding`: Enable the embedding mode. **Completion function doesn't work in this mode**.
 - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`;
 - `--port`: Set the port to listen. Default: `8080`.
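For example, a server started with GPU offload and the new low-VRAM mode could be launched like this (model path and layer count are illustrative):

```bash
LLAMA_BUILD_SERVER=1 make
./server -m models/7B/ggml-model.bin --ctx_size 2048 -ngl 32 --low-vram --host 127.0.0.1 --port 8080
```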

examples/server/server.cpp

@@ -405,6 +405,7 @@ void server_print_usage(int /*argc*/, char **argv, const gpt_params &params)
     fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" );
+    fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" );
 #endif
     fprintf(stderr, " -m FNAME, --model FNAME\n");
     fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
@@ -537,6 +538,14 @@ bool server_params_parse(int argc, char **argv, server_params &sparams, gpt_para
         }
 #else
         fprintf(stderr, "WARNING: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n");
+#endif // GGML_USE_CUBLAS
+    }
+    else if (arg == "--low-vram" || arg == "-lv")
+    {
+#ifdef GGML_USE_CUBLAS
+        params.low_vram = true;
+#else
+        fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n");
 #endif // GGML_USE_CUBLAS
     }
     else if (arg == "--main-gpu" || arg == "-mg")

examples/train-text-from-scratch/CMakeLists.txt

@@ -0,0 +1,4 @@
+set(TARGET train-text-from-scratch)
+add_executable(${TARGET} train-text-from-scratch.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/train-text-from-scratch/README.md

@@ -0,0 +1,22 @@
# train-text-from-scratch
Basic usage instructions:
```bash
# get training data
wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt
# train
./bin/train-text-from-scratch \
--vocab-model ../models/ggml-vocab.bin \
--ctx 64 --embd 256 --head 8 --layer 16 \
--checkpoint-in chk-shakespeare-256x16.bin \
--checkpoint-out chk-shakespeare-256x16.bin \
--model-out ggml-shakespeare-256x16-f32.bin \
--train-data "shakespeare.txt" \
-t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
--print-details-interval 0 --predict 16 --use-flash
# predict
./bin/main -m ggml-shakespeare-256x16-f32.bin
```
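Since `--checkpoint-in` and `--checkpoint-out` point at the same file above, training can be resumed simply by re-running the same command; sampling from the resulting model is an ordinary `main` invocation (the prompt and token count here are illustrative):

```bash
./bin/main -m ggml-shakespeare-256x16-f32.bin -p "ROMEO:" -n 128
```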

examples/train-text-from-scratch/train-text-from-scratch.cpp: file diff suppressed because it is too large

ggml-cuda.cu: file diff suppressed because it is too large

ggml-cuda.h

@@ -28,8 +28,10 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
+void ggml_cuda_free_scratch(void);
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 #ifdef __cplusplus

ggml-metal.h

@@ -55,6 +55,7 @@ void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor *
 void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
 // same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
 void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
 #ifdef __cplusplus

ggml-metal.m

@ -287,15 +287,40 @@ void ggml_metal_graph_compute(
struct ggml_cgraph * gf) { struct ggml_cgraph * gf) {
metal_printf("%s: evaluating graph\n", __func__); metal_printf("%s: evaluating graph\n", __func__);
// create multiple command buffers and enqueue them
// then, we encode the graph into the command buffers in parallel
const int n_cb = gf->n_threads;
NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
for (int i = 0; i < n_cb; ++i) {
command_buffers[i] = [ctx->queue commandBuffer];
// enqueue the command buffers in order to specify their execution order
[command_buffers[i] enqueue];
}
// TODO: is this the best way to start threads?
dispatch_queue_t queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
const int n_nodes_per_cb = (gf->n_nodes + n_cb - 1) / n_cb;
dispatch_async(queue, ^{
size_t offs_src0 = 0; size_t offs_src0 = 0;
size_t offs_src1 = 0; size_t offs_src1 = 0;
size_t offs_dst = 0; size_t offs_dst = 0;
id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer]; id<MTLCommandBuffer> command_buffer = command_buffers[cb_idx];
id<MTLComputeCommandEncoder> encoder = nil; id<MTLComputeCommandEncoder> encoder = nil;
for (int i = 0; i < gf->n_nodes; ++i) { const int node_start = (cb_idx + 0) * n_nodes_per_cb;
//metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); const int node_end = (cb_idx == n_cb - 1) ? gf->n_nodes : (cb_idx + 1) * n_nodes_per_cb;
for (int i = node_start; i < node_end; ++i) {
metal_printf("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
struct ggml_tensor * src0 = gf->nodes[i]->src0; struct ggml_tensor * src0 = gf->nodes[i]->src0;
struct ggml_tensor * src1 = gf->nodes[i]->src1; struct ggml_tensor * src1 = gf->nodes[i]->src1;
@ -626,7 +651,6 @@ void ggml_metal_graph_compute(
} }
}; };
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
@ -800,12 +824,11 @@ void ggml_metal_graph_compute(
} }
[command_buffer commit]; [command_buffer commit];
[command_buffer waitUntilCompleted]; });
{
const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
UNUSED(time_elapsed);
metal_printf("%s: time elapsed = %f ms\n", __func__, time_elapsed * 1000.0);
} }
// wait for all threads to finish
dispatch_barrier_sync(queue, ^{});
[command_buffers[n_cb - 1] waitUntilCompleted];
} }

2099
ggml.c

File diff suppressed because it is too large

128
ggml.h

@ -296,6 +296,7 @@ extern "C" {
GGML_OP_SUM_ROWS, GGML_OP_SUM_ROWS,
GGML_OP_MEAN, GGML_OP_MEAN,
GGML_OP_REPEAT, GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK,
GGML_OP_ABS, GGML_OP_ABS,
GGML_OP_SGN, GGML_OP_SGN,
GGML_OP_NEG, GGML_OP_NEG,
@ -309,6 +310,7 @@ extern "C" {
GGML_OP_RMS_NORM_BACK, GGML_OP_RMS_NORM_BACK,
GGML_OP_MUL_MAT, GGML_OP_MUL_MAT,
GGML_OP_OUT_PROD,
GGML_OP_SCALE, GGML_OP_SCALE,
GGML_OP_SET, GGML_OP_SET,
@ -324,6 +326,7 @@ extern "C" {
GGML_OP_DIAG_MASK_INF, GGML_OP_DIAG_MASK_INF,
GGML_OP_DIAG_MASK_ZERO, GGML_OP_DIAG_MASK_ZERO,
GGML_OP_SOFT_MAX, GGML_OP_SOFT_MAX,
GGML_OP_SOFT_MAX_BACK,
GGML_OP_ROPE, GGML_OP_ROPE,
GGML_OP_ROPE_BACK, GGML_OP_ROPE_BACK,
GGML_OP_ALIBI, GGML_OP_ALIBI,
@ -333,10 +336,14 @@ extern "C" {
GGML_OP_FLASH_ATTN, GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF, GGML_OP_FLASH_FF,
GGML_OP_FLASH_ATTN_BACK,
GGML_OP_MAP_UNARY, GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY, GGML_OP_MAP_BINARY,
GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_COUNT, GGML_OP_COUNT,
}; };
@ -478,6 +485,7 @@ extern "C" {
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
// use this to compute the memory overhead of a tensor // use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void); GGML_API size_t ggml_tensor_overhead(void);
@ -574,6 +582,11 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_add1_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_acc( GGML_API struct ggml_tensor * ggml_acc(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -645,6 +658,11 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_abs( GGML_API struct ggml_tensor * ggml_abs(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
@ -698,14 +716,22 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// A: m rows, n columns // A: n columns, m rows
// B: p rows, n columns (i.e. we transpose it internally) // B: n columns, p rows (i.e. we transpose it internally)
// result is m columns, p rows // result is m columns, p rows
GGML_API struct ggml_tensor * ggml_mul_mat( GGML_API struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// A: m columns, n rows,
// B: p columns, n rows,
// result is m columns, p rows
GGML_API struct ggml_tensor * ggml_out_prod(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// //
// operations on tensors without backpropagation // operations on tensors without backpropagation
// //
@ -916,6 +942,17 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// rotary position embedding // rotary position embedding
// if mode & 1 == 1, skip n_past elements // if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style // if mode & 2 == 1, GPT-NeoX style
@ -982,6 +1019,14 @@ extern "C" {
struct ggml_tensor * v, struct ggml_tensor * v,
bool masked); bool masked);
GGML_API struct ggml_tensor * ggml_flash_attn_back(
struct ggml_context * ctx,
struct ggml_tensor * q,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * d,
bool masked);
GGML_API struct ggml_tensor * ggml_flash_ff( GGML_API struct ggml_tensor * ggml_flash_ff(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1005,6 +1050,19 @@ extern "C" {
struct ggml_tensor * b, struct ggml_tensor * b,
ggml_binary_op_f32_t fun); ggml_binary_op_f32_t fun);
// loss function
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c);
// //
// automatic differentiation // automatic differentiation
// //
@ -1099,6 +1157,8 @@ extern "C" {
struct { struct {
int n_iter; int n_iter;
float sched; // schedule multiplier (fixed, decay or warmup)
float decay; // weight decay for AdamW, use 0.0f to disable
float alpha; // learning rate float alpha; // learning rate
float beta1; float beta1;
float beta2; float beta2;
@ -1123,6 +1183,49 @@ extern "C" {
} lbfgs; } lbfgs;
}; };
struct ggml_opt_context {
struct ggml_context * ctx;
struct ggml_opt_params params;
int iter;
int64_t nx; // number of parameter elements
bool just_initialized;
struct {
struct ggml_tensor * x; // view of the parameters
struct ggml_tensor * g1; // gradient
struct ggml_tensor * g2; // gradient squared
struct ggml_tensor * m; // first moment
struct ggml_tensor * v; // second moment
struct ggml_tensor * mh; // first moment hat
struct ggml_tensor * vh; // second moment hat
struct ggml_tensor * pf; // past function values
float fx_best;
float fx_prev;
int n_no_improvement;
} adam;
struct {
struct ggml_tensor * x; // current parameters
struct ggml_tensor * xp; // previous parameters
struct ggml_tensor * g; // current gradient
struct ggml_tensor * gp; // previous gradient
struct ggml_tensor * d; // search direction
struct ggml_tensor * pf; // past function values
struct ggml_tensor * lmal; // the L-BFGS memory alpha
struct ggml_tensor * lmys; // the L-BFGS memory ys
struct ggml_tensor * lms; // the L-BFGS memory s
struct ggml_tensor * lmy; // the L-BFGS memory y
float fx_best;
float step;
int j;
int k;
int end;
int n_no_improvement;
} lbfgs;
};
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
// optimize the function defined by the tensor f // optimize the function defined by the tensor f
@ -1131,6 +1234,27 @@ extern "C" {
struct ggml_opt_params params, struct ggml_opt_params params,
struct ggml_tensor * f); struct ggml_tensor * f);
// initialize optimizer context
GGML_API void ggml_opt_init(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_opt_params params,
int64_t nx);
// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f);
// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume_g(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb);
// //
// quantization // quantization
// //

178
llama.cpp

@ -40,6 +40,10 @@
#include <sstream> #include <sstream>
#include <numeric> #include <numeric>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define LLAMA_USE_SCRATCH #define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16 #define LLAMA_MAX_SCRATCH_BUFFERS 16
@ -165,6 +169,11 @@ struct llama_kv_cache {
if (ctx) { if (ctx) {
ggml_free(ctx); ggml_free(ctx);
} }
#ifdef GGML_USE_CUBLAS
ggml_cuda_free_data(k);
ggml_cuda_free_data(v);
#endif // GGML_USE_CUBLAS
} }
}; };
@ -210,6 +219,7 @@ struct llama_model {
for (size_t i = 0; i < tensors_by_name.size(); ++i) { for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cuda_free_data(tensors_by_name[i].second); ggml_cuda_free_data(tensors_by_name[i].second);
} }
ggml_cuda_free_scratch();
#elif defined(GGML_USE_CLBLAST) #elif defined(GGML_USE_CLBLAST)
for (size_t i = 0; i < tensors_by_name.size(); ++i) { for (size_t i = 0; i < tensors_by_name.size(); ++i) {
ggml_cl_free_data(tensors_by_name[i].second); ggml_cl_free_data(tensors_by_name[i].second);
@ -867,7 +877,8 @@ static bool kv_cache_init(
const struct llama_hparams & hparams, const struct llama_hparams & hparams,
struct llama_kv_cache & cache, struct llama_kv_cache & cache,
ggml_type wtype, ggml_type wtype,
int n_ctx) { int n_ctx,
int n_gpu_layers) {
const int n_embd = hparams.n_embd; const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer; const int n_layer = hparams.n_layer;
@ -893,6 +904,15 @@ static bool kv_cache_init(
ggml_set_name(cache.k, "cache_k"); ggml_set_name(cache.k, "cache_k");
ggml_set_name(cache.v, "cache_v"); ggml_set_name(cache.v, "cache_v");
#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > n_layer + 1) {
ggml_cuda_assign_buffers_no_scratch(cache.v);
}
if (n_gpu_layers > n_layer + 2) {
ggml_cuda_assign_buffers_no_scratch(cache.k);
}
#endif // GGML_USE_CUBLAS
return true; return true;
} }
@ -903,6 +923,7 @@ struct llama_context_params llama_context_default_params() {
/*.gpu_layers =*/ 0, /*.gpu_layers =*/ 0,
/*.main_gpu =*/ 0, /*.main_gpu =*/ 0,
/*.tensor_split =*/ {0}, /*.tensor_split =*/ {0},
/*.low_vram =*/ false,
/*.seed =*/ -1, /*.seed =*/ -1,
/*.f16_kv =*/ true, /*.f16_kv =*/ true,
/*.logits_all =*/ false, /*.logits_all =*/ false,
@ -1011,6 +1032,7 @@ static void llama_model_load_internal(
int n_gpu_layers, int n_gpu_layers,
int main_gpu, int main_gpu,
const float * tensor_split, const float * tensor_split,
bool low_vram,
ggml_type memory_type, ggml_type memory_type,
bool use_mmap, bool use_mmap,
bool use_mlock, bool use_mlock,
@ -1036,6 +1058,12 @@ static void llama_model_load_internal(
case 40: model.type = e_model::MODEL_13B; break; case 40: model.type = e_model::MODEL_13B; break;
case 60: model.type = e_model::MODEL_30B; break; case 60: model.type = e_model::MODEL_30B; break;
case 80: model.type = e_model::MODEL_65B; break; case 80: model.type = e_model::MODEL_65B; break;
default:
{
if (hparams.n_layer < 32) {
model.type = e_model::MODEL_7B;
}
} break;
} }
hparams.n_ctx = n_ctx; hparams.n_ctx = n_ctx;
@ -1131,18 +1159,34 @@ static void llama_model_load_internal(
ml->ggml_ctx = ctx; ml->ggml_ctx = ctx;
model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU); model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
// "output" tensor // "output" tensor
{ {
ggml_backend backend_norm;
ggml_backend backend_output; ggml_backend backend_output;
if (n_gpu_layers > int(n_layer)) { // NOLINT if (n_gpu_layers > int(n_layer)) { // NOLINT
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#else
backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
} else { } else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU; backend_output = GGML_BACKEND_CPU;
} }
model.norm = ml->get_tensor("norm.weight", {n_embd}, backend_norm);
model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output); model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
vram_weights += ggml_nbytes(model.norm);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
vram_weights += ggml_nbytes(model.output);
}
} }
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
@ -1200,23 +1244,49 @@ static void llama_model_load_internal(
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
(void) vram_scratch; (void) vram_scratch;
(void) n_batch;
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
if (low_vram) {
fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
ggml_cuda_set_scratch_size(0); // disable scratch
} else {
vram_scratch = n_batch * MB; vram_scratch = n_batch * MB;
ggml_cuda_set_scratch_size(vram_scratch); ggml_cuda_set_scratch_size(vram_scratch);
if (n_gpu_layers > 0) { if (n_gpu_layers > 0) {
fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n", fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
__func__, vram_scratch / MB); __func__, vram_scratch / MB);
} }
}
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu); fprintf(stderr, "%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
if (n_gpu_layers > (int) hparams.n_layer) { if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: offloading output layer to GPU\n", __func__); fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
} }
size_t vram_kv_cache = 0;
if (n_gpu_layers > (int) hparams.n_layer + 1) {
if (low_vram) {
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
} else {
fprintf(stderr, "%s: offloading v cache to GPU\n", __func__);
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
}
}
if (n_gpu_layers > (int) hparams.n_layer + 2) {
if (low_vram) {
fprintf(stderr, "%s: cannot offload k cache to GPU due to low VRAM option\n", __func__);
} else {
fprintf(stderr, "%s: offloading k cache to GPU\n", __func__);
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
}
}
const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
__func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
fprintf(stderr, "%s: total VRAM used: %zu MB\n", fprintf(stderr, "%s: total VRAM used: %zu MB\n",
__func__, (vram_weights + vram_scratch + MB - 1) / MB); // round up __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
#else #else
(void) n_gpu_layers; (void) n_gpu_layers;
#endif #endif
@ -1227,6 +1297,7 @@ static void llama_model_load_internal(
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor); model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
} }
(void) tensor_split;
#if defined(GGML_USE_CUBLAS) #if defined(GGML_USE_CUBLAS)
{ {
ggml_cuda_set_tensor_split(tensor_split); ggml_cuda_set_tensor_split(tensor_split);
@ -1254,6 +1325,7 @@ static bool llama_model_load(
int n_gpu_layers, int n_gpu_layers,
int main_gpu, int main_gpu,
float * tensor_split, float * tensor_split,
bool low_vram,
ggml_type memory_type, ggml_type memory_type,
bool use_mmap, bool use_mmap,
bool use_mlock, bool use_mlock,
@ -1261,7 +1333,7 @@ static bool llama_model_load(
llama_progress_callback progress_callback, llama_progress_callback progress_callback,
void *progress_callback_user_data) { void *progress_callback_user_data) {
try { try {
llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, memory_type, llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
return true; return true;
} catch (const std::exception & err) { } catch (const std::exception & err) {
@ -1337,12 +1409,33 @@ static bool llama_eval_internal(
const int i_gpu_start = n_layer - n_gpu_layers; const int i_gpu_start = n_layer - n_gpu_layers;
(void) i_gpu_start; (void) i_gpu_start;
// offload functions set the tensor output backend to GPU
// tensors are GPU-accelerated if any input or the output has been offloaded
//
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
// in that case ggml_cuda_assign_buffers has no effect
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
offload_func_t offload_func_kq = llama_nop;
offload_func_t offload_func_v = llama_nop;
#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > n_layer) {
offload_func_nr = ggml_cuda_assign_buffers;
}
if (n_gpu_layers > n_layer + 1) {
offload_func_v = ggml_cuda_assign_buffers;
}
if (n_gpu_layers > n_layer + 2) {
offload_func_kq = ggml_cuda_assign_buffers;
}
#endif // GGML_USE_CUBLAS
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
offload_func_t offload_func = llama_nop; offload_func_t offload_func = llama_nop;
#ifdef GGML_USE_CUBLAS #ifdef GGML_USE_CUBLAS
if (il >= i_gpu_start) { if (il >= i_gpu_start) {
offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU offload_func = ggml_cuda_assign_buffers;
} }
#endif // GGML_USE_CUBLAS #endif // GGML_USE_CUBLAS
@ -1365,31 +1458,42 @@ static bool llama_eval_internal(
// self-attention // self-attention
{ {
// compute Q and K and RoPE them // compute Q and K and RoPE them
struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
// offload_func(tmpq);
ggml_set_name(tmpq, "tmpq");
struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur); struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
// offload_func(tmpk); offload_func_kq(tmpk);
ggml_set_name(tmpk, "tmpk"); ggml_set_name(tmpk, "tmpk");
struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0); struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0);
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur"); ggml_set_name(Kcur, "Kcur");
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0); struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur"); ggml_set_name(Qcur, "Qcur");
// store key and value to memory // store key and value to memory
{ {
// compute the transposed [N, n_embd] V matrix // compute the transposed [N, n_embd] V matrix
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
offload_func_v(tmpv);
ggml_set_name(tmpv, "tmpv");
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N));
offload_func_v(Vcur);
ggml_set_name(Vcur, "Vcur"); ggml_set_name(Vcur, "Vcur");
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
offload_func_kq(k);
ggml_set_name(k, "k");
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v));
offload_func_v(v);
ggml_set_name(v, "v");
// important: storing RoPE-ed version of K in the KV cache!
@@ -1401,6 +1505,7 @@ static bool llama_eval_internal(
ggml_permute(ctx0,
Qcur,
0, 2, 1, 3);
offload_func_kq(Q);
ggml_set_name(Q, "Q");
struct ggml_tensor * K =
@@ -1409,10 +1514,12 @@ static bool llama_eval_internal(
ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
offload_func_kq(K);
ggml_set_name(K, "K");
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
offload_func_kq(KQ);
ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd/n_head)
@@ -1421,14 +1528,17 @@ static bool llama_eval_internal(
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
offload_func_kq(KQ_scaled);
ggml_set_name(KQ_scaled, "KQ_scaled");
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
offload_func_kq(KQ_masked);
ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
offload_func_v(KQ_soft_max);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
// split cached V into n_head heads
@@ -1438,10 +1548,12 @@ static bool llama_eval_internal(
n_ctx*ggml_element_size(kv_self.v),
n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head,
il*n_ctx*ggml_element_size(kv_self.v)*n_embd);
offload_func_v(V);
ggml_set_name(V, "V");
#if 1
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
offload_func_v(KQV);
ggml_set_name(KQV, "KQV");
#else
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
@@ -1453,12 +1565,14 @@ static bool llama_eval_internal(
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
offload_func_v(KQV_merged);
ggml_set_name(KQV_merged, "KQV_merged");
// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
offload_func_v(cur);
ggml_set_name(cur, "KQV_merged_contiguous"); ggml_set_name(cur, "KQV_merged_contiguous");
// projection (no bias) // projection (no bias)
@ -1470,7 +1584,6 @@ static bool llama_eval_internal(
} }
lctx.use_buf(ctx0, 1); lctx.use_buf(ctx0, 1);
//ggml_cuda_set_scratch(1);
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
offload_func(inpFF); offload_func(inpFF);
@ -1528,32 +1641,24 @@ static bool llama_eval_internal(
} }
lctx.use_buf(ctx0, 0); lctx.use_buf(ctx0, 0);
//ggml_cuda_set_scratch(0);
// used at the end to optionally extract the embeddings // used at the end to optionally extract the embeddings
struct ggml_tensor * embeddings = NULL; struct ggml_tensor * embeddings = NULL;
- offload_func_t offload_func = llama_nop;
- #ifdef GGML_USE_CUBLAS
- if (n_gpu_layers > n_layer) {
- offload_func = ggml_cuda_assign_buffers; // sets the output backend to GPU
- }
- #endif // GGML_USE_CUBLAS
// norm
{
cur = ggml_rms_norm(ctx0, inpL);
- offload_func(cur);
+ offload_func_nr(cur);
ggml_set_name(cur, "rms_norm_inpL");
cur = ggml_rms_norm(ctx0, cur);
- offload_func(cur);
+ offload_func_nr(cur);
ggml_set_name(cur, "rms_norm_after");
// cur = cur*norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.norm);
- offload_func(cur);
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
ggml_set_name(cur, "result_norm");
embeddings = cur;
@@ -2161,6 +2266,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
return -log2f(candidate.p) > *mu;
}));
if (candidates->size == 0) {
candidates->size = 1;
}
// Normalize the probabilities of the remaining words
llama_sample_softmax(ctx, candidates);
@@ -2540,8 +2649,8 @@ struct llama_context * llama_init_from_file(
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
- if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
- params.main_gpu, params.tensor_split, memory_type, params.use_mmap, params.use_mlock,
+ if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu,
+ params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
llama_free(ctx);
@@ -2550,7 +2659,7 @@ struct llama_context * llama_init_from_file(
// reserve memory for context buffers
if (!params.vocab_only) {
- if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
+ if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) {
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
return nullptr;
@@ -3287,6 +3396,19 @@ int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
}
int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity) {
int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
for (int i = 0; i<n; ++i) {
strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
scores[i] = ctx->vocab.id_to_token[i].score;
}
return n;
}
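// Illustrative usage sketch (not part of the diff): reading the vocabulary
// through the new llama_get_vocab() API shown above. Using llama_n_vocab()
// for the capacity is an assumption made for the example.
std::vector<const char *> vocab_strings(llama_n_vocab(ctx));
std::vector<float>        vocab_scores(llama_n_vocab(ctx));
const int n_vocab = llama_get_vocab(ctx, vocab_strings.data(), vocab_scores.data(), (int) vocab_strings.size());
for (int i = 0; i < n_vocab; ++i) {
    printf("%6d  %8.3f  %s\n", i, vocab_scores[i], vocab_strings[i]);
}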
float * llama_get_logits(struct llama_context * ctx) {
return ctx->logits.data();
}
llama.h
@@ -77,6 +77,7 @@ extern "C" {
int n_gpu_layers; // number of layers to store in VRAM
int main_gpu; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
bool low_vram; // if true, reduce VRAM usage at the cost of performance
int seed; // RNG seed, -1 for random
bool f16_kv; // use fp16 for KV cache
@@ -220,6 +221,14 @@ extern "C" {
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
@@ -235,9 +244,9 @@ extern "C" {
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
// Special tokens
- LLAMA_API llama_token llama_token_bos();
- LLAMA_API llama_token llama_token_eos();
- LLAMA_API llama_token llama_token_nl();
+ LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
+ LLAMA_API llama_token llama_token_eos(); // end-of-sentence
+ LLAMA_API llama_token llama_token_nl();  // next-line
// Sampling functions
@@ -10,6 +10,10 @@
#include <ggml.h>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
constexpr int kVecSize = 1 << 18;
float drawFromGaussianPdf(std::mt19937& rndm) {
spm-headers/ggml.h Symbolic link
@@ -0,0 +1 @@
../ggml.h
@@ -5,7 +5,7 @@
#include <stdlib.h>
#include <assert.h>
- #define MAX_NARGS 2
+ #define MAX_NARGS 3
#undef MIN
#undef MAX
@@ -1090,6 +1090,25 @@ int main(int argc, const char ** argv) {
}
}
// cross_entropy_loss
{
const int nargs = 1;
int64_t ne2[4];
get_random_dims(ne2, 4);
for (int ndims = 1; ndims <= 3; ++ndims) {
x[0] = get_random_tensor(ctx0, ndims, ne2, -1.0f, 1.0f);
x[1] = get_random_tensor(ctx0, ndims, ne2, 0.0f, 1.0f);
ggml_set_param(ctx0, x[0]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_cross_entropy_loss(ctx0, x[0], x[1]));
check_gradient("cross_entropy_loss", ctx0, x, f, ndims, nargs, 1e-1f, 1e-2f, INFINITY);
// finite differences regularly fails!
}
}
// rope
{
const int nargs = 1;
@@ -1124,6 +1143,45 @@ int main(int argc, const char ** argv) {
}
}
// flash_attn
{
const int nargs = 3;
int64_t ne2[4];
get_random_dims(ne2, 4);
int64_t D = ne2[0];
int64_t N = ne2[1];
int64_t M = ne2[2] + N;
int64_t B = ne2[3];
for (int masked = 0; masked <= 1; ++masked) {
for (int ndims = 2; ndims <= 4; ++ndims) {
int64_t neq[4] = { D, N, B, ne[3] };
int64_t nek[4] = { D, M, B, ne[3] };
int64_t nev[4] = { M, D, B, ne[3] };
if (ndims == 2) {
neq[2] = 1; neq[3] = 1;
nek[2] = 1; nek[3] = 1;
nev[2] = 1; nev[3] = 1;
} else if (ndims == 3) {
neq[3] = 1;
nek[3] = 1;
nev[3] = 1;
}
x[0] = get_random_tensor(ctx0, ndims, neq, -0.1250f, 0.1250f);
x[1] = get_random_tensor(ctx0, ndims, nek, -0.1250f, 0.1250f);
x[2] = get_random_tensor(ctx0, ndims, nev, -0.1250f, 0.1250f);
ggml_set_param(ctx0, x[0]);
ggml_set_param(ctx0, x[1]);
ggml_set_param(ctx0, x[2]);
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
check_gradient("flash_attn", ctx0, x, f, ndims, nargs, 1.5e-4f, INFINITY, 3.5f);
}
}
}
ggml_free(ctx0);
}
@@ -9,12 +9,15 @@
#include <string>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
- const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001;
- const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002;
- const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075;
- const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040;
- const float MAX_DOT_PRODUCT_ERROR = 0.02;
+ const float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
+ const float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
+ const float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
+ const float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
+ const float MAX_DOT_PRODUCT_ERROR = 0.02f;
const char* RESULT_STR[] = {"ok", "FAILED"};
@@ -13,6 +13,10 @@
#include <string>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#define MAX_ALIGNMENT 64
#define QK 32
#define WARMUP 5
@@ -176,27 +176,27 @@ void test_frequency_presence_penalty(
int main(void) {
ggml_time_init();
- test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4}, 1);
+ test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
- test_top_k({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2}, 3);
+ test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
- test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4}, 0);
+ test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
- test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3}, 0.7);
+ test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
- test_top_p({0.1, 0.2, 0.3, 0.4}, {0.4, 0.3, 0.2, 0.1}, 1);
+ test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
- test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3}, 0.25);
+ test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
- test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.75);
+ test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
- test_tfs({0.1, 0.15, 0.2, 0.25, 0.3}, {0.3, 0.25}, 0.99);
+ test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
- test_typical({0.97, 0.01, 0.01, 0.01}, {0.97}, 0.5);
+ test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
- test_typical({0.4, 0.2, 0.2, 0.2}, {0.2, 0.2, 0.2}, 0.5);
+ test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
- test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.25, 0.25, 0.25, 0.25, 0}, 50.0);
+ test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f);
- test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.5, 0.5, 0, 0, 0}, 50.0);
+ test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
- test_repetition_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.5, 0.5, 0, 0, 0}, 50.0);
+ test_repetition_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f);
- test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0}, {0.249997, 0.249997, 0.249997, 0.249997, 0.000011}, 5.0, 5.0);
+ test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 5.0f, 5.0f);
- test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2}, {0.499966, 0.499966, 0.000023, 0.000023, 0.000023}, 5.0, 5.0);
+ test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 5.0f, 5.0f);
- test_frequency_presence_penalty({0.2, 0.2, 0.2, 0.2, 0.2}, {0, 1, 2, 0, 0}, {0.499977, 0.499977, 0.000023, 0.000023, 0.000000}, 5.0, 5.0);
+ test_frequency_presence_penalty({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 5.0f, 5.0f);
printf("OK\n");
}
@@ -53,7 +53,7 @@ int main(int argc, char **argv) {
for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res(test_kv.first.size());
- const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), res.size(), true);
+ const int n = llama_tokenize(ctx, test_kv.first.c_str(), res.data(), int(res.size()), true);
res.resize(n);
bool correct = res.size() == test_kv.second.size();