Merge branch 'ggerganov:master' into master
This commit is contained in:
commit
edf46a38ff
5 changed files with 138 additions and 12 deletions
|
@ -13,18 +13,22 @@
|
||||||
cudaPackages,
|
cudaPackages,
|
||||||
darwin,
|
darwin,
|
||||||
rocmPackages,
|
rocmPackages,
|
||||||
|
vulkan-headers,
|
||||||
|
vulkan-loader,
|
||||||
clblast,
|
clblast,
|
||||||
useBlas ? builtins.all (x: !x) [
|
useBlas ? builtins.all (x: !x) [
|
||||||
useCuda
|
useCuda
|
||||||
useMetalKit
|
useMetalKit
|
||||||
useOpenCL
|
useOpenCL
|
||||||
useRocm
|
useRocm
|
||||||
|
useVulkan
|
||||||
],
|
],
|
||||||
useCuda ? config.cudaSupport,
|
useCuda ? config.cudaSupport,
|
||||||
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
|
||||||
useMpi ? false, # Increases the runtime closure size by ~700M
|
useMpi ? false, # Increases the runtime closure size by ~700M
|
||||||
useOpenCL ? false,
|
useOpenCL ? false,
|
||||||
useRocm ? config.rocmSupport,
|
useRocm ? config.rocmSupport,
|
||||||
|
useVulkan ? false,
|
||||||
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
|
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
|
||||||
}@inputs:
|
}@inputs:
|
||||||
|
|
||||||
|
@ -48,7 +52,8 @@ let
|
||||||
++ lib.optionals useMetalKit [ "MetalKit" ]
|
++ lib.optionals useMetalKit [ "MetalKit" ]
|
||||||
++ lib.optionals useMpi [ "MPI" ]
|
++ lib.optionals useMpi [ "MPI" ]
|
||||||
++ lib.optionals useOpenCL [ "OpenCL" ]
|
++ lib.optionals useOpenCL [ "OpenCL" ]
|
||||||
++ lib.optionals useRocm [ "ROCm" ];
|
++ lib.optionals useRocm [ "ROCm" ]
|
||||||
|
++ lib.optionals useVulkan [ "Vulkan" ];
|
||||||
|
|
||||||
pnameSuffix =
|
pnameSuffix =
|
||||||
strings.optionalString (suffices != [ ])
|
strings.optionalString (suffices != [ ])
|
||||||
|
@ -108,6 +113,11 @@ let
|
||||||
hipblas
|
hipblas
|
||||||
rocblas
|
rocblas
|
||||||
];
|
];
|
||||||
|
|
||||||
|
vulkanBuildInputs = [
|
||||||
|
vulkan-headers
|
||||||
|
vulkan-loader
|
||||||
|
];
|
||||||
in
|
in
|
||||||
|
|
||||||
effectiveStdenv.mkDerivation (
|
effectiveStdenv.mkDerivation (
|
||||||
|
@ -164,7 +174,8 @@ effectiveStdenv.mkDerivation (
|
||||||
++ optionals useCuda cudaBuildInputs
|
++ optionals useCuda cudaBuildInputs
|
||||||
++ optionals useMpi [ mpi ]
|
++ optionals useMpi [ mpi ]
|
||||||
++ optionals useOpenCL [ clblast ]
|
++ optionals useOpenCL [ clblast ]
|
||||||
++ optionals useRocm rocmBuildInputs;
|
++ optionals useRocm rocmBuildInputs
|
||||||
|
++ optionals useVulkan vulkanBuildInputs;
|
||||||
|
|
||||||
cmakeFlags =
|
cmakeFlags =
|
||||||
[
|
[
|
||||||
|
@ -178,6 +189,7 @@ effectiveStdenv.mkDerivation (
|
||||||
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
||||||
(cmakeBool "LLAMA_METAL" useMetalKit)
|
(cmakeBool "LLAMA_METAL" useMetalKit)
|
||||||
(cmakeBool "LLAMA_MPI" useMpi)
|
(cmakeBool "LLAMA_MPI" useMpi)
|
||||||
|
(cmakeBool "LLAMA_VULKAN" useVulkan)
|
||||||
]
|
]
|
||||||
++ optionals useCuda [
|
++ optionals useCuda [
|
||||||
(
|
(
|
||||||
|
@ -218,6 +230,7 @@ effectiveStdenv.mkDerivation (
|
||||||
useMpi
|
useMpi
|
||||||
useOpenCL
|
useOpenCL
|
||||||
useRocm
|
useRocm
|
||||||
|
useVulkan
|
||||||
;
|
;
|
||||||
|
|
||||||
shell = mkShell {
|
shell = mkShell {
|
||||||
|
@ -242,11 +255,11 @@ effectiveStdenv.mkDerivation (
|
||||||
# Configurations we don't want even the CI to evaluate. Results in the
|
# Configurations we don't want even the CI to evaluate. Results in the
|
||||||
# "unsupported platform" messages. This is mostly a no-op, because
|
# "unsupported platform" messages. This is mostly a no-op, because
|
||||||
# cudaPackages would've refused to evaluate anyway.
|
# cudaPackages would've refused to evaluate anyway.
|
||||||
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
|
badPlatforms = optionals (useCuda || useOpenCL || useVulkan) lib.platforms.darwin;
|
||||||
|
|
||||||
# Configurations that are known to result in build failures. Can be
|
# Configurations that are known to result in build failures. Can be
|
||||||
# overridden by importing Nixpkgs with `allowBroken = true`.
|
# overridden by importing Nixpkgs with `allowBroken = true`.
|
||||||
broken = (useMetalKit && !effectiveStdenv.isDarwin);
|
broken = (useMetalKit && !effectiveStdenv.isDarwin) || (useVulkan && effectiveStdenv.isDarwin);
|
||||||
|
|
||||||
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
|
||||||
homepage = "https://github.com/ggerganov/llama.cpp/";
|
homepage = "https://github.com/ggerganov/llama.cpp/";
|
||||||
|
|
|
@ -79,7 +79,7 @@ if (NOT MSVC)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
option(LLAMA_WIN_VER "llama: Windows Version" 0x602)
|
set(LLAMA_WIN_VER "0x602" CACHE STRING "llama: Windows Version")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# 3rd party libs
|
# 3rd party libs
|
||||||
|
|
10
Makefile
10
Makefile
|
@ -109,6 +109,7 @@ MK_NVCCFLAGS += -O3
|
||||||
else
|
else
|
||||||
MK_CFLAGS += -O3
|
MK_CFLAGS += -O3
|
||||||
MK_CXXFLAGS += -O3
|
MK_CXXFLAGS += -O3
|
||||||
|
MK_NVCCFLAGS += -O3
|
||||||
endif
|
endif
|
||||||
|
|
||||||
# clock_gettime came in POSIX.1b (1993)
|
# clock_gettime came in POSIX.1b (1993)
|
||||||
|
@ -365,7 +366,7 @@ ifdef LLAMA_CUBLAS
|
||||||
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
|
MK_CPPFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
|
||||||
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
|
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
|
||||||
OBJS += ggml-cuda.o
|
OBJS += ggml-cuda.o
|
||||||
MK_NVCCFLAGS = -use_fast_math
|
MK_NVCCFLAGS += -use_fast_math
|
||||||
ifndef JETSON_EOL_MODULE_DETECT
|
ifndef JETSON_EOL_MODULE_DETECT
|
||||||
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
|
MK_NVCCFLAGS += --forward-unknown-to-host-compiler
|
||||||
endif # JETSON_EOL_MODULE_DETECT
|
endif # JETSON_EOL_MODULE_DETECT
|
||||||
|
@ -552,8 +553,11 @@ $(info I CFLAGS: $(CFLAGS))
|
||||||
$(info I CXXFLAGS: $(CXXFLAGS))
|
$(info I CXXFLAGS: $(CXXFLAGS))
|
||||||
$(info I NVCCFLAGS: $(NVCCFLAGS))
|
$(info I NVCCFLAGS: $(NVCCFLAGS))
|
||||||
$(info I LDFLAGS: $(LDFLAGS))
|
$(info I LDFLAGS: $(LDFLAGS))
|
||||||
$(info I CC: $(shell $(CC) --version | head -n 1))
|
$(info I CC: $(shell $(CC) --version | head -n 1))
|
||||||
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
$(info I CXX: $(shell $(CXX) --version | head -n 1))
|
||||||
|
ifdef LLAMA_CUBLAS
|
||||||
|
$(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
|
||||||
|
endif # LLAMA_CUBLAS
|
||||||
$(info )
|
$(info )
|
||||||
|
|
||||||
#
|
#
|
||||||
|
|
|
@ -36,6 +36,8 @@ public:
|
||||||
void set_parameters(StatParams&& params) { m_params = std::move(params); }
|
void set_parameters(StatParams&& params) { m_params = std::move(params); }
|
||||||
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
||||||
void save_imatrix() const;
|
void save_imatrix() const;
|
||||||
|
bool load_imatrix(const char * file_name, bool add);
|
||||||
|
static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
|
||||||
private:
|
private:
|
||||||
std::unordered_map<std::string, Stats> m_stats;
|
std::unordered_map<std::string, Stats> m_stats;
|
||||||
StatParams m_params;
|
StatParams m_params;
|
||||||
|
@ -189,6 +191,57 @@ void IMatrixCollector::save_imatrix(const char * fname) const {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
|
||||||
|
std::ifstream in(imatrix_file, std::ios::binary);
|
||||||
|
if (!in) {
|
||||||
|
printf("%s: failed to open %s\n",__func__,imatrix_file);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
int n_entries;
|
||||||
|
in.read((char*)&n_entries, sizeof(n_entries));
|
||||||
|
if (in.fail() || n_entries < 1) {
|
||||||
|
printf("%s: no data in file %s\n", __func__, imatrix_file);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < n_entries; ++i) {
|
||||||
|
int len; in.read((char *)&len, sizeof(len));
|
||||||
|
std::vector<char> name_as_vec(len+1);
|
||||||
|
in.read((char *)name_as_vec.data(), len);
|
||||||
|
if (in.fail()) {
|
||||||
|
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
name_as_vec[len] = 0;
|
||||||
|
std::string name{name_as_vec.data()};
|
||||||
|
auto& e = imatrix_data[std::move(name)];
|
||||||
|
int ncall;
|
||||||
|
in.read((char*)&ncall, sizeof(ncall));
|
||||||
|
int nval;
|
||||||
|
in.read((char *)&nval, sizeof(nval));
|
||||||
|
if (in.fail() || nval < 1) {
|
||||||
|
printf("%s: failed reading number of values for entry %d\n",__func__,i);
|
||||||
|
imatrix_data = {};
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
e.values.resize(nval);
|
||||||
|
in.read((char*)e.values.data(), nval*sizeof(float));
|
||||||
|
if (in.fail()) {
|
||||||
|
printf("%s: failed reading data for entry %d\n",__func__,i);
|
||||||
|
imatrix_data = {};
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
e.ncall = ncall;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
|
||||||
|
if (!add) {
|
||||||
|
m_stats.clear();
|
||||||
|
}
|
||||||
|
return load_imatrix(file_name, m_stats);
|
||||||
|
}
|
||||||
|
|
||||||
static IMatrixCollector g_collector;
|
static IMatrixCollector g_collector;
|
||||||
|
|
||||||
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||||
|
@ -269,7 +322,7 @@ static void process_logits(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
|
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
|
||||||
|
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
@ -282,6 +335,15 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
|
||||||
|
if (from_chunk > 0) {
|
||||||
|
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
|
||||||
|
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
|
||||||
|
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
if (int(tokens.size()) < 2*n_ctx) {
|
if (int(tokens.size()) < 2*n_ctx) {
|
||||||
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
|
fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx,
|
||||||
n_ctx);
|
n_ctx);
|
||||||
|
@ -402,7 +464,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
|
|
||||||
StatParams sparams;
|
StatParams sparams;
|
||||||
|
std::string prev_result_file;
|
||||||
|
std::string combine_files;
|
||||||
bool compute_ppl = true;
|
bool compute_ppl = true;
|
||||||
|
int from_chunk = 0;
|
||||||
std::vector<char*> args;
|
std::vector<char*> args;
|
||||||
args.push_back(argv[0]);
|
args.push_back(argv[0]);
|
||||||
int iarg = 1;
|
int iarg = 1;
|
||||||
|
@ -423,6 +488,13 @@ int main(int argc, char ** argv) {
|
||||||
compute_ppl = false;
|
compute_ppl = false;
|
||||||
} else if (arg == "--keep-imatrix") {
|
} else if (arg == "--keep-imatrix") {
|
||||||
sparams.keep_every = std::stoi(argv[++iarg]);
|
sparams.keep_every = std::stoi(argv[++iarg]);
|
||||||
|
} else if (arg == "--continue-from") {
|
||||||
|
prev_result_file = argv[++iarg];
|
||||||
|
} else if (arg == "--combine") {
|
||||||
|
combine_files = argv[++iarg];
|
||||||
|
}
|
||||||
|
else if (arg == "--from-chunk") {
|
||||||
|
from_chunk = std::stoi(argv[++iarg]);
|
||||||
} else {
|
} else {
|
||||||
args.push_back(argv[iarg]);
|
args.push_back(argv[iarg]);
|
||||||
}
|
}
|
||||||
|
@ -436,14 +508,50 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
g_collector.set_parameters(std::move(sparams));
|
||||||
|
|
||||||
|
if (!combine_files.empty()) {
|
||||||
|
std::vector<std::string> files;
|
||||||
|
size_t pos = 0;
|
||||||
|
while (true) {
|
||||||
|
auto new_pos = combine_files.find(',', pos);
|
||||||
|
if (new_pos != std::string::npos) {
|
||||||
|
files.emplace_back(combine_files.substr(pos, new_pos - pos));
|
||||||
|
pos = new_pos + 1;
|
||||||
|
} else {
|
||||||
|
files.emplace_back(combine_files.substr(pos));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (files.size() < 2) {
|
||||||
|
fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
printf("Combining the following %d files\n", int(files.size()));
|
||||||
|
for (auto& file : files) {
|
||||||
|
printf(" %s\n", file.c_str());
|
||||||
|
if (!g_collector.load_imatrix(file.c_str(), true)) {
|
||||||
|
fprintf(stderr, "Failed to load %s\n", file.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
g_collector.save_imatrix();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!prev_result_file.empty()) {
|
||||||
|
if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
|
||||||
|
fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
params.n_batch = 512;
|
params.n_batch = 512;
|
||||||
if (!gpt_params_parse(args.size(), args.data(), params)) {
|
if (!gpt_params_parse(args.size(), args.data(), params)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
g_collector.set_parameters(std::move(sparams));
|
|
||||||
|
|
||||||
params.logits_all = true;
|
params.logits_all = true;
|
||||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
|
@ -495,7 +603,7 @@ int main(int argc, char ** argv) {
|
||||||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool OK = compute_imatrix(ctx, params, compute_ppl);
|
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
|
||||||
if (!OK) {
|
if (!OK) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -157,6 +157,7 @@
|
||||||
|
|
||||||
mpi-cpu = config.packages.default.override { useMpi = true; };
|
mpi-cpu = config.packages.default.override { useMpi = true; };
|
||||||
mpi-cuda = config.packages.default.override { useMpi = true; };
|
mpi-cuda = config.packages.default.override { useMpi = true; };
|
||||||
|
vulkan = config.packages.default.override { useVulkan = true; };
|
||||||
}
|
}
|
||||||
// lib.optionalAttrs (system == "x86_64-linux") {
|
// lib.optionalAttrs (system == "x86_64-linux") {
|
||||||
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
|
rocm = config.legacyPackages.llamaPackagesRocm.llama-cpp;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue