Merge remote-tracking branch 'origin/master' into llama-model-params
commit c8a9658e65
30 changed files with 7015 additions and 2423 deletions
.github/workflows/build.yml (vendored, 32 changed lines)

@@ -457,22 +457,22 @@ jobs:
 path: |
 cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip

-freeBSD-latest:
-runs-on: macos-12
-steps:
-- name: Clone
-uses: actions/checkout@v3
-
-- name: Build
-uses: cross-platform-actions/action@v0.19.0
-with:
-operating_system: freebsd
-version: '13.2'
-hypervisor: 'qemu'
-run: |
-sudo pkg update
-sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
-gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
+# freeBSD-latest:
+# runs-on: macos-12
+# steps:
+# - name: Clone
+# uses: actions/checkout@v3
+#
+# - name: Build
+# uses: cross-platform-actions/action@v0.19.0
+# with:
+# operating_system: freebsd
+# version: '13.2'
+# hypervisor: 'qemu'
+# run: |
+# sudo pkg update
+# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15

 release:
 if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

.gitignore (vendored, 2 changed lines)

@@ -52,6 +52,8 @@ models-mnt
 /server
 /simple
 /batched
+/export-lora
+/finetune
 /speculative
 /parallel
 /train-text-from-scratch

Makefile (15 changed lines)

@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel tests/test-c.o
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative parallel finetune export-lora tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama

@@ -500,6 +500,9 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
     $(CXX) $(CXXFLAGS) -c $< -o $@

+train.o: common/train.cpp common/train.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
     $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

@@ -550,7 +553,7 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o $(OBJS)
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS)
     $(CXX) $(TTFS_CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS)

@@ -559,12 +562,18 @@ convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggm
 llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o $(OBJS)
+baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS)
     $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

common/CMakeLists.txt

@@ -9,6 +9,8 @@ add_library(${TARGET} OBJECT
 console.cpp
 grammar-parser.h
 grammar-parser.cpp
+train.h
+train.cpp
 )

 if (BUILD_SHARED_LIBS)

common/common.cpp

@@ -78,7 +78,7 @@ int32_t get_num_physical_cores() {
 return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

-static void process_escapes(std::string& input) {
+void process_escapes(std::string& input) {
 std::size_t input_len = input.length();
 std::size_t output_idx = 0;

@@ -361,7 +361,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 invalid_param = true;
 break;
 }
-params.lora_adapter = argv[i];
+params.lora_adapter.push_back({argv[i], 1.0f});
+params.use_mmap = false;
+} else if (arg == "--lora-scaled") {
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+const char * lora_adapter = argv[i];
+if (++i >= argc) {
+invalid_param = true;
+break;
+}
+params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
 params.use_mmap = false;
 } else if (arg == "--lora-base") {
 if (++i >= argc) {

@@ -707,6 +719,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --verbose-prompt print prompt before generation\n");
 fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
 printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
 printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
 printf(" -m FNAME, --model FNAME\n");
 printf(" model path (default: %s)\n", params.model.c_str());

@@ -802,10 +815,15 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 return std::make_tuple(nullptr, nullptr);
 }

-if (!params.lora_adapter.empty()) {
+for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
+float lora_scale = std::get<1>(params.lora_adapter[i]);
 int err = llama_model_apply_lora_from_file(model,
-params.lora_adapter.c_str(),
-params.lora_base.empty() ? NULL : params.lora_base.c_str(),
+lora_adapter.c_str(),
+lora_scale,
+((i > 0) || params.lora_base.empty())
+? NULL
+: params.lora_base.c_str(),
 params.n_threads);
 if (err != 0) {
 fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);

@@ -1258,7 +1276,20 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
 fprintf(stream, " %d: %f", lb.first, lb.second);
 }

-fprintf(stream, "lora: %s\n", params.lora_adapter.c_str());
+fprintf(stream, "lora:\n");
+for (std::tuple<std::string, float> la : params.lora_adapter) {
+if (std::get<1>(la) != 1.0f) {
+continue;
+}
+fprintf(stream, " - %s\n", std::get<0>(la).c_str());
+}
+fprintf(stream, "lora_scaled:\n");
+for (std::tuple<std::string, float> la : params.lora_adapter) {
+if (std::get<1>(la) == 1.0f) {
+continue;
+}
+fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
+}
 fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
 fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
 fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false");

common/common.h

@@ -86,8 +86,8 @@ struct gpt_params {
 std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
 std::string logdir = ""; // directory in which to save YAML log files

-std::string lora_adapter = ""; // lora adapter path
+std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
 std::string lora_base = ""; // base model path for the lora adapter

 int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
 int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line

@@ -130,6 +130,8 @@ std::string get_system_info(const gpt_params & params);

 std::string gpt_random_prompt(std::mt19937 & rng);

+void process_escapes(std::string& input);
+
 //
 // Model utils
 //

common/train.cpp (new file, 1496 lines)

File diff suppressed because it is too large.

common/train.h (new file, 230 lines)

// Various helper functions and utilities for training

#pragma once

#include <string>
#include <random>
#include <vector>

#include "ggml.h"
#include "llama.h"

typedef std::string mt19937_state;

struct train_state {
    struct ggml_opt_context * opt;

    uint64_t train_its;
    uint64_t train_samples;
    uint64_t train_tokens;
    uint64_t train_epochs;

    size_t shuffle_samples_hash; // fn, sample_count, *zip(sample_begins, sample_sizes)
    mt19937_state shuffle_rng_state_current;
    mt19937_state shuffle_rng_state_next;
    size_t shuffle_sample_count;
    size_t shuffle_next_sample;
};

struct train_params_common {
    const char * fn_train_data;
    const char * fn_checkpoint_in;
    const char * fn_checkpoint_out;
    const char * pattern_fn_it;
    const char * fn_latest;

    bool print_usage;

    int save_every;

    uint32_t seed;

    int n_ctx;
    int n_threads;
    int n_batch;
    int n_gradient_accumulation;
    int n_epochs;

    bool custom_n_ctx;

    bool use_flash;
    bool use_checkpointing;

    std::string sample_start;
    bool include_sample_start;
    bool escape;
    bool overlapping_samples;
    bool fill_with_next_samples;
    bool separate_with_eos;
    bool separate_with_bos;
    bool sample_random_offsets;

    bool force_reshuffle;

    int warmup;
    int cos_decay_steps;
    float cos_decay_restart;
    float cos_decay_min;
    bool enable_restart;

    int opt_past;
    float opt_delta;
    int opt_max_no_improvement;

    int adam_n_iter;
    float adam_alpha;
    float adam_min_alpha;
    float adam_decay;
    int adam_decay_min_ndim;
    float adam_beta1;
    float adam_beta2;
    float adam_gclip;
    float adam_eps_f;
};

typedef void (*save_train_files_callback)(void * data, struct train_state * train);

struct train_opt_callback_data {
    struct train_params_common * params;
    struct train_state * train;
    save_train_files_callback save_cb;
    void * save_data;
    struct llama_context * lctx;
    int last_save_iter;
    llama_token * tokens_data;
    size_t tokens_size;
    size_t * samples_begin;
    size_t * samples_size;
    size_t * shuffled_samples_offs;
    size_t * shuffled_samples_begin;
    size_t * shuffled_samples_size;
    size_t samples_count;
    struct ggml_tensor * tokens_input;
    struct ggml_tensor * target_probs;
    int first_iter;
    int first_epoch;
    int iter_at_last_epoch;
    int64_t last_time;
    double millis_per_iter;
};

struct train_state * init_train_state();
void free_train_state(struct train_state * state);

struct train_params_common get_default_train_params_common();
void print_common_train_usage(int /*argc*/, char ** argv, const struct train_params_common * params);

bool consume_common_train_arg(int argc, char ** argv, int * idx, struct train_params_common * params, bool * invalid_param);
void finish_processing_train_args(struct train_params_common * params);

struct random_normal_distribution;
struct random_uniform_distribution;

struct random_normal_distribution * init_random_normal_distribution (int seed, float mean, float std, float min, float max);
struct random_uniform_distribution * init_random_uniform_distribution(int seed, float min, float max);

void free_random_normal_distribution (struct random_normal_distribution * rnd);
void free_random_uniform_distribution(struct random_uniform_distribution * rnd);

struct ggml_tensor * randomize_tensor_normal (struct ggml_tensor * tensor, struct random_normal_distribution * rnd);
struct ggml_tensor * randomize_tensor_uniform(struct ggml_tensor * tensor, struct random_uniform_distribution * rnd);

// generate random float in interval [0,1)
float frand();
float frand_normal (struct random_normal_distribution * rnd);
float frand_uniform(struct random_uniform_distribution * rnd);

int   clamp (const int v, const int min, const int max);
float fclamp(const float v, const float min, const float max);

void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0);
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1);
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2);
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3);

size_t tokenize_file(
    struct llama_context * lctx,
    const char * filename,
    const std::string & sample_start,
    bool include_sample_start,
    bool overlapping_samples,
    unsigned context_length,
    std::vector<llama_token> & out_tokens,
    std::vector<size_t> & out_samples_begin,
    std::vector<size_t> & out_samples_size);

int64_t get_example_targets_batch(
    struct llama_context * lctx,
    struct ggml_tensor * tokens_input,
    struct ggml_tensor * target_probs,
    int64_t example_id,
    const size_t * samples_offs,
    const size_t * samples_begin,
    const size_t * samples_size,
    size_t samples_count,
    const llama_token * train_data,
    size_t n_train_data,
    bool separate_with_eos,
    bool separate_with_bos,
    bool fill_with_next_samples,
    bool sample_random_offsets);

void mt19937_set_state(std::mt19937& rng, const mt19937_state& rng_state);
mt19937_state mt19937_get_state(const std::mt19937& rng);
mt19937_state mt19937_seed_to_state(unsigned seed);

mt19937_state shuffle_samples(
    const mt19937_state & rng_state,
    size_t * shuffled_offs,
    size_t * shuffled_begins,
    size_t * shuffled_sizes,
    const size_t * begins,
    const size_t * sizes,
    size_t count);

size_t hash_combine(size_t h1, size_t h2);

size_t compute_samples_hash(
    const char* fn,
    const size_t* samples_begin,
    const size_t* samples_size,
    size_t sample_count);

std::string replace_str(const char * s, const char * needle, const char * replacement);

void print_duration(double milliseconds);

float cosine_decay(
    int64_t step,
    int64_t decay_steps,
    float minimum);

float cosine_decay_restart(
    int64_t step,
    int64_t decay_steps,
    float minimum,
    float restart_step_mult);

float learning_schedule(
    int64_t step,
    int64_t warmup_steps,
    int64_t decay_steps,
    float learning_rate,
    float overall_minimum,
    float cos_decay_minimum,
    float cos_decay_restart_step_mult,
    bool enable_restart);

void copy_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name);

void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt);
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt);

bool load_train_state_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct train_state * train);
void save_train_state_gguf(struct gguf_context * fctx, struct train_state * train);

std::string get_train_filename(const char * filename, const char * pattern_it, const char * latest, int64_t iteration);

void train_opt_callback(void * vdata, int accum_step, float * sched, bool * cancel);

@@ -133,8 +133,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []

 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():

@@ -177,12 +175,8 @@ for i in range(vocab_size):
 text = bytearray(pad_token)

 tokens.append(text)
-scores.append(0.0) # dymmy
-toktypes.append(gguf.TokenType.NORMAL) # dummy

 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

@@ -117,8 +117,6 @@ gguf_writer.add_file_type(ftype)
 print("gguf: get tokenizer metadata")

 tokens: list[bytearray] = []
-scores: list[float] = []
-toktypes: list[int] = []

 tokenizer_json_file = dir_model / 'tokenizer.json'
 if not tokenizer_json_file.is_file():

@@ -161,12 +159,8 @@ for i in range(vocab_size):
 text = bytearray(pad_token)

 tokens.append(text)
-scores.append(0.0) # dymmy
-toktypes.append(gguf.TokenType.NORMAL) # dummy

 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
-gguf_writer.add_token_types(toktypes)

 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
 special_vocab.add_to_gguf(gguf_writer)

examples/CMakeLists.txt

@@ -21,6 +21,7 @@ else()
 add_subdirectory(benchmark)
 add_subdirectory(baby-llama)
 add_subdirectory(train-text-from-scratch)
+add_subdirectory(finetune)
 add_subdirectory(convert-llama2c-to-ggml)
 add_subdirectory(simple)
 add_subdirectory(batched)

@@ -35,4 +36,5 @@ else()
 if (LLAMA_BUILD_SERVER)
 add_subdirectory(server)
 endif()
+add_subdirectory(export-lora)
 endif()

examples/baby-llama/baby-llama.cpp

@@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "train.h"
 #include <vector>
 #include <cassert>
 #include <random>

@@ -14,31 +15,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 constexpr float rms_norm_eps = 5e-6f;
 #endif

-static float frand() {
-return (float)rand()/(float)RAND_MAX;
-}
-
-struct random_normal_distribution {
-std::mt19937 gen;
-std::normal_distribution<float> nd;
-float min;
-float max;
-};
-
-static void init_random_normal_distribution(
-struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
-) {
-rnd->gen = std::mt19937(seed);
-rnd->nd = std::normal_distribution<float>{mean, std};
-rnd->min = min;
-rnd->max = max;
-}
-
-static float frand_normal(struct random_normal_distribution * rnd) {
-const float r = rnd->nd(rnd->gen);
-return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
-}
-
 static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
 struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

@@ -93,54 +69,6 @@ static struct ggml_tensor * randomize_tensor(
 return tensor;
 }

-static struct ggml_tensor * randomize_tensor_normal(
-struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
-) {
-float scale = 1.0; // xavier
-switch (ndims) {
-case 1:
-scale /= sqrtf(ne[0]);
-for (int i0 = 0; i0 < ne[0]; i0++) {
-((float *)tensor->data)[i0] = scale * frand_normal(rnd);
-}
-break;
-case 2:
-scale /= sqrtf(ne[0]+ne[1]);
-for (int i1 = 0; i1 < ne[1]; i1++) {
-for (int i0 = 0; i0 < ne[0]; i0++) {
-((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
-}
-}
-break;
-case 3:
-scale /= sqrtf(ne[0]+ne[1]);
-for (int i2 = 0; i2 < ne[2]; i2++) {
-for (int i1 = 0; i1 < ne[1]; i1++) {
-for (int i0 = 0; i0 < ne[0]; i0++) {
-((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
-}
-}
-}
-break;
-case 4:
-scale /= sqrtf(ne[0]+ne[1]);
-for (int i3 = 0; i3 < ne[3]; i3++) {
-for (int i2 = 0; i2 < ne[2]; i2++) {
-for (int i1 = 0; i1 < ne[1]; i1++) {
-for (int i0 = 0; i0 < ne[0]; i0++) {
-((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
-}
-}
-}
-}
-break;
-default:
-assert(false);
-};
-
-return tensor;
-}
-
 struct llama_hparams {
 uint32_t n_vocab = 32000;
 uint32_t n_ctx = 512; // this is provided as user input?

@@ -398,27 +326,29 @@ static void randomize_model(struct llama_model * model, int seed, float mean, fl

 const uint32_t n_layer = hparams.n_layer;

-struct random_normal_distribution rnd;
-init_random_normal_distribution(&rnd, seed, mean, std, min, max);
-randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
-randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
-randomize_tensor_normal(model->output, model->output->n_dims, model->output->ne, &rnd);
+struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
+randomize_tensor_normal(model->tok_embeddings , rnd);
+randomize_tensor_normal(model->norm           , rnd);
+randomize_tensor_normal(model->output         , rnd);

 for (uint32_t i = 0; i < n_layer; ++i) {
 auto & layer = model->layers[i];
-randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+randomize_tensor_normal(layer.attention_norm, rnd);

-randomize_tensor_normal(layer.wq, layer.wq->n_dims, layer.wq->ne, &rnd);
-randomize_tensor_normal(layer.wk, layer.wk->n_dims, layer.wk->ne, &rnd);
-randomize_tensor_normal(layer.wv, layer.wv->n_dims, layer.wv->ne, &rnd);
-randomize_tensor_normal(layer.wo, layer.wo->n_dims, layer.wo->ne, &rnd);
+randomize_tensor_normal(layer.wq, rnd);
+randomize_tensor_normal(layer.wk, rnd);
+randomize_tensor_normal(layer.wv, rnd);
+randomize_tensor_normal(layer.wo, rnd);

-randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+randomize_tensor_normal(layer.ffn_norm, rnd);

-randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
-randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
-randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+randomize_tensor_normal(layer.w1, rnd);
+randomize_tensor_normal(layer.w2, rnd);
+randomize_tensor_normal(layer.w3, rnd);
 }

+free_random_normal_distribution(rnd);
 }

@@ -429,32 +359,34 @@ static void randomize_model_lora(

 const uint32_t n_layer = hparams.n_layer;

-struct random_normal_distribution rnd;
-init_random_normal_distribution(&rnd, seed, mean, std, min, max);
-randomize_tensor_normal(model->tok_embeddings, model->tok_embeddings->n_dims, model->tok_embeddings->ne, &rnd);
-randomize_tensor_normal(model->norm, model->norm->n_dims, model->norm->ne, &rnd);
-randomize_tensor_normal(model->outputa, model->outputa->n_dims, model->outputa->ne, &rnd);
-randomize_tensor_normal(model->outputb, model->outputb->n_dims, model->outputb->ne, &rnd);
+struct random_normal_distribution * rnd = init_random_normal_distribution(seed, mean, std, min, max);
+randomize_tensor_normal(model->tok_embeddings, rnd);
+randomize_tensor_normal(model->norm    , rnd);
+randomize_tensor_normal(model->outputa , rnd);
+randomize_tensor_normal(model->outputb , rnd);

 for (uint32_t i = 0; i < n_layer; ++i) {
 auto & layer = model->layers[i];
-randomize_tensor_normal(layer.attention_norm, layer.attention_norm->n_dims, layer.attention_norm->ne, &rnd);
+randomize_tensor_normal(layer.attention_norm, rnd);

-randomize_tensor_normal(layer.wqa, layer.wqa->n_dims, layer.wqa->ne, &rnd);
-randomize_tensor_normal(layer.wqb, layer.wqb->n_dims, layer.wqb->ne, &rnd);
-randomize_tensor_normal(layer.wka, layer.wka->n_dims, layer.wka->ne, &rnd);
-randomize_tensor_normal(layer.wkb, layer.wkb->n_dims, layer.wkb->ne, &rnd);
-randomize_tensor_normal(layer.wva, layer.wva->n_dims, layer.wva->ne, &rnd);
-randomize_tensor_normal(layer.wvb, layer.wvb->n_dims, layer.wvb->ne, &rnd);
-randomize_tensor_normal(layer.woa, layer.woa->n_dims, layer.woa->ne, &rnd);
-randomize_tensor_normal(layer.wob, layer.wob->n_dims, layer.wob->ne, &rnd);
+randomize_tensor_normal(layer.wqa, rnd);
+randomize_tensor_normal(layer.wqb, rnd);
+randomize_tensor_normal(layer.wka, rnd);
+randomize_tensor_normal(layer.wkb, rnd);
+randomize_tensor_normal(layer.wva, rnd);
+randomize_tensor_normal(layer.wvb, rnd);
+randomize_tensor_normal(layer.woa, rnd);
+randomize_tensor_normal(layer.wob, rnd);

-randomize_tensor_normal(layer.ffn_norm, layer.ffn_norm->n_dims, layer.ffn_norm->ne, &rnd);
+randomize_tensor_normal(layer.ffn_norm, rnd);

-randomize_tensor_normal(layer.w1, layer.w1->n_dims, layer.w1->ne, &rnd);
-randomize_tensor_normal(layer.w2, layer.w2->n_dims, layer.w2->ne, &rnd);
-randomize_tensor_normal(layer.w3, layer.w3->n_dims, layer.w3->ne, &rnd);
+randomize_tensor_normal(layer.w1, rnd);
+randomize_tensor_normal(layer.w2, rnd);
+randomize_tensor_normal(layer.w3, rnd);
 }

+free_random_normal_distribution(rnd);
 }

 static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {

@@ -762,32 +694,6 @@ static struct ggml_tensor * forward(
 return inpL;
 }

-static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
-GGML_ASSERT(tensor->n_dims == 1);
-GGML_ASSERT(tensor->ne[0] == ne0);
-}
-
-static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
-GGML_ASSERT(tensor->n_dims == 2);
-GGML_ASSERT(tensor->ne[0] == ne0);
-GGML_ASSERT(tensor->ne[1] == ne1);
-}
-
-static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
-GGML_ASSERT(tensor->n_dims == 3);
-GGML_ASSERT(tensor->ne[0] == ne0);
-GGML_ASSERT(tensor->ne[1] == ne1);
-GGML_ASSERT(tensor->ne[2] == ne2);
-}
-
-static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
-GGML_ASSERT(tensor->n_dims == 4);
-GGML_ASSERT(tensor->ne[0] == ne0);
-GGML_ASSERT(tensor->ne[1] == ne1);
-GGML_ASSERT(tensor->ne[2] == ne2);
-GGML_ASSERT(tensor->ne[3] == ne3);
-}
-
 static struct ggml_tensor * forward_batch(
 struct llama_model * model,
 struct llama_kv_cache * cache,

examples/export-lora/CMakeLists.txt (new file, 5 lines)

set(TARGET export-lora)
add_executable(${TARGET} export-lora.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/export-lora/README.md (new file, 26 lines)

# export-lora

Apply LORA adapters to base model and export the resulting model.

```
usage: export-lora [options]

options:
  -h, --help                         show this help message and exit
  -m FNAME, --model-base FNAME       model path from which to load base model (default '')
  -o FNAME, --model-out FNAME        path to save exported model (default '')
  -l FNAME, --lora FNAME             apply LoRA adapter
  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S
  -t N, --threads N                  number of threads to use during computation (default: 4)
```

For example:

```bash
./bin/export-lora \
    -m open-llama-3b-v2-q8_0.gguf \
    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
    -l lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.bin
```

Multiple LORA adapters can be applied by passing multiple `-l FN` or `-s FN S` command line parameters.

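As a rough illustration of that last point, a hypothetical invocation combining both flags (the adapter and output file names below are made up, not taken from this commit) might look like:

```bash
./bin/export-lora \
    -m open-llama-3b-v2-q8_0.gguf \
    -o open-llama-3b-v2-q8_0-merged.gguf \
    -l lora-adapter-a-LATEST.bin \
    -s lora-adapter-b-LATEST.bin 0.5
```

Each `-l FN` adds an adapter at scale 1.0 and each `-s FN S` adds one at the given scale, matching the argument parsing in `export-lora.cpp` below.
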
examples/export-lora/export-lora.cpp (new file, 474 lines)

#include "common.h"
#include "ggml.h"
#include "ggml-alloc.h"

#include <vector>
#include <string>
#include <thread>

static const size_t tensor_alignment = 32;

struct lora_info {
    std::string filename;
    float scale;
};

struct export_lora_params {
    std::string fn_model_base;
    std::string fn_model_out;
    std::vector<struct lora_info> lora;
    int n_threads;
};

struct lora_data {
    struct lora_info info;
    std::vector<uint8_t> data;
    struct ggml_context * ctx;

    uint32_t lora_r;
    uint32_t lora_alpha;
};

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            size = 0;
        } else {
            seek(0, SEEK_END);
            size = tell();
            seek(0, SEEK_SET);
        }
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        GGML_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        GGML_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            die_fmt("read error: %s", strerror(errno));
        }
        if (ret != 1) {
            die("unexpectedly reached end of file");
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    void write_raw(const void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        size_t ret = std::fwrite(ptr, size, 1, fp);
        if (ret != 1) {
            die_fmt("write error: %s", strerror(errno));
        }
    }

    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }

    bool eof() {
        return tell() >= size;
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

static struct export_lora_params get_default_export_lora_params() {
    struct export_lora_params result;
    result.fn_model_base = "";
    result.fn_model_out  = "";
    result.n_threads = GGML_DEFAULT_N_THREADS;
    return result;
}

static void export_lora_print_usage(int /*argc*/, char ** argv, const struct export_lora_params * params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help                         show this help message and exit\n");
    fprintf(stderr, "  -m FNAME, --model-base FNAME       model path from which to load base model (default '%s')\n", params->fn_model_base.c_str());
    fprintf(stderr, "  -o FNAME, --model-out FNAME        path to save exported model (default '%s')\n", params->fn_model_out.c_str());
    fprintf(stderr, "  -l FNAME, --lora FNAME             apply LoRA adapter\n");
    fprintf(stderr, "  -s FNAME S, --lora-scaled FNAME S  apply LoRA adapter with user defined scaling S\n");
    fprintf(stderr, "  -t N, --threads N                  number of threads to use during computation (default: %d)\n", params->n_threads);
}

static bool export_lora_params_parse(int argc, char ** argv, struct export_lora_params * params) {
    bool invalid_param = false;
    std::string arg;
    struct export_lora_params default_params = get_default_export_lora_params();
    const std::string arg_prefix = "--";

    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
            std::replace(arg.begin(), arg.end(), '_', '-');
        }

        if (arg == "-m" || arg == "--model-base") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_model_base = argv[i];
        } else if (arg == "-o" || arg == "--model-out") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->fn_model_out = argv[i];
        } else if (arg == "-l" || arg == "--lora") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            struct lora_info lora;
            lora.filename = argv[i];
            lora.scale = 1.0f;
            params->lora.push_back(lora);
        } else if (arg == "-s" || arg == "--lora-scaled") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            struct lora_info lora;
            lora.filename = argv[i];
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            lora.scale = std::stof(argv[i]);
            params->lora.push_back(lora);
        } else if (arg == "-t" || arg == "--threads") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params->n_threads = std::stoi(argv[i]);
            if (params->n_threads <= 0) {
                params->n_threads = std::thread::hardware_concurrency();
            }
        } else {
            fprintf(stderr, "error: unknown argument: '%s'\n", arg.c_str());
            export_lora_print_usage(argc, argv, &default_params);
            exit(1);
        }
    }

    if (params->fn_model_base == default_params.fn_model_base) {
        fprintf(stderr, "error: please specify a filename for model-base.\n");
        export_lora_print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (params->fn_model_out == default_params.fn_model_out) {
        fprintf(stderr, "error: please specify a filename for model-out.\n");
        export_lora_print_usage(argc, argv, &default_params);
        exit(1);
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: '%s'\n", arg.c_str());
        export_lora_print_usage(argc, argv, &default_params);
        exit(1);
    }
    return true;
}

static void free_lora(struct lora_data * lora) {
    if (lora->ctx != NULL) {
        ggml_free(lora->ctx);
    }
    delete lora;
}

static struct lora_data * load_lora(struct lora_info * info) {
    struct lora_data * result = new struct lora_data;
    result->info = *info;
    result->ctx = NULL;
    result->lora_r = 1;
    result->lora_alpha = 1;

    struct llama_file file(info->filename.c_str(), "rb");
    if (file.fp == NULL) {
        fprintf(stderr, "warning: Could not open lora adapter '%s'. Ignoring this adapter.\n",
            info->filename.c_str());
        free_lora(result);
        return NULL;
    }

    struct ggml_init_params params_ggml;
    params_ggml.mem_size   = ggml_tensor_overhead() * GGML_MAX_NODES;
    params_ggml.mem_buffer = NULL;
    params_ggml.no_alloc   = true;
    result->ctx = ggml_init(params_ggml);

    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
    uint32_t magic = file.read_u32();
    if (magic != LLAMA_FILE_MAGIC_LORA) {
        die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str());
    }
    uint32_t version = file.read_u32();
    if (version != 1) {
        die_fmt("unexpected lora file version '%u' in '%s'", (unsigned) version, info->filename.c_str());
    }
    result->lora_r     = file.read_u32();
    result->lora_alpha = file.read_u32();
    // read tensor infos from file
    std::vector<char> name_buf;
    std::vector<struct ggml_tensor *> tensors;
    std::vector<size_t> tensors_offset;
    size_t total_nbytes_pad = 0;
    while(!file.eof()) {
        int64_t ne[4]   = {1,1,1,1};
        uint32_t n_dims  = file.read_u32();
        uint32_t namelen = file.read_u32();
        uint32_t type    = file.read_u32();
        for (uint32_t k = 0; k < n_dims; ++k) {
            ne[k] = (int64_t)file.read_u32();
        }
        name_buf.clear();
        name_buf.resize(namelen + 1, '\0');
        file.read_raw(name_buf.data(), namelen);
        file.seek((0-file.tell()) & 31, SEEK_CUR);
        size_t offset = file.tell();
        struct ggml_tensor * tensor = ggml_new_tensor(result->ctx, (enum ggml_type) type, n_dims, ne);
        ggml_set_name(tensor, name_buf.data());
        size_t nbytes     = ggml_nbytes(tensor);
        size_t nbytes_pad = ggml_nbytes_pad(tensor);
        total_nbytes_pad += nbytes_pad;
        tensors.push_back(tensor);
        tensors_offset.push_back(offset);
        file.seek(nbytes, SEEK_CUR);
    }
    // read tensor data
    result->data.resize(total_nbytes_pad);
    size_t data_offset = 0;
    for (size_t i = 0; i < tensors.size(); ++i) {
        struct ggml_tensor * tensor = tensors[i];
        size_t offset     = tensors_offset[i];
        size_t nbytes     = ggml_nbytes(tensor);
        size_t nbytes_pad = ggml_nbytes_pad(tensor);
        file.seek(offset, SEEK_SET);
        tensor->data = result->data.data() + data_offset;
        file.read_raw(tensor->data, nbytes);
        data_offset += nbytes_pad;
    }
    return result;
}

static struct ggml_cgraph * build_graph_lora(
    struct ggml_context * ctx,
    struct ggml_tensor * tensor,
    struct ggml_tensor * lora_a,
    struct ggml_tensor * lora_b,
    float scaling
) {
    struct ggml_tensor * ab = ggml_mul_mat(ctx, lora_a, lora_b);
    if (scaling != 1.0f) {
        ab = ggml_scale(ctx, ab, ggml_new_f32(ctx, scaling));
    }
    struct ggml_tensor * res = ggml_add_inplace(ctx, tensor, ab);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand (gf, res);
    return gf;
}

static bool apply_lora(struct ggml_tensor * tensor, struct lora_data * lora, int n_threads) {
    if (lora->ctx == NULL) {
        return false;
    }
    std::string name = ggml_get_name(tensor);
    std::string name_a = name + std::string(".loraA");
    std::string name_b = name + std::string(".loraB");
    struct ggml_tensor * lora_a = ggml_get_tensor(lora->ctx, name_a.c_str());
    struct ggml_tensor * lora_b = ggml_get_tensor(lora->ctx, name_b.c_str());
    if (lora_a == NULL || lora_b == NULL) {
        return false;
    }

    float scaling = lora->info.scale * (float)lora->lora_alpha / (float)lora->lora_r;

    struct ggml_init_params params;
    params.mem_size   = GGML_OBJECT_SIZE + GGML_GRAPH_SIZE + ggml_tensor_overhead()*4 + GGML_MEM_ALIGN*5;
    params.mem_buffer = NULL;
    params.no_alloc   = true;
    struct ggml_context * ctx = NULL;
    struct ggml_allocr * alloc = NULL;
    struct ggml_cgraph * gf = NULL;

    ctx   = ggml_init(params);
    alloc = ggml_allocr_new_measure(tensor_alignment);
    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
    size_t alloc_size = ggml_allocr_alloc_graph(alloc, gf);
    ggml_allocr_free(alloc);
    ggml_free(ctx);

    static std::vector<uint8_t> data_compute;
    data_compute.resize(alloc_size + tensor_alignment);

    ctx   = ggml_init(params);
    alloc = ggml_allocr_new(data_compute.data(), data_compute.size(), tensor_alignment);
    gf    = build_graph_lora(ctx, tensor, lora_a, lora_b, scaling);
    ggml_allocr_alloc_graph(alloc, gf);
    ggml_allocr_free(alloc);

    struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads);
    static std::vector<uint8_t> data_work;
    data_work.resize(cplan.work_size);
    cplan.work_data = data_work.data();

    ggml_graph_compute(gf, &cplan);

    ggml_free(ctx);
    return true;
}

static void export_lora(struct export_lora_params * params) {
    // load all loras
    std::vector<struct lora_data *> loras;
    for (size_t i = 0; i < params->lora.size(); ++i) {
        struct lora_data * lora = load_lora(&params->lora[i]);
        if (lora != NULL) {
            loras.push_back(lora);
        }
    }
    if (loras.size() == 0) {
        fprintf(stderr, "warning: no lora adapters will be applied.\n");
    }

    // open input file
    struct llama_file fin(params->fn_model_base.c_str(), "rb");
    if (!fin.fp) {
        die_fmt("Could not open file '%s'\n", params->fn_model_base.c_str());
    }

    // open base model gguf, read tensors without their data
    struct ggml_context * ctx_in;
    struct gguf_init_params params_gguf;
    params_gguf.no_alloc = true;
    params_gguf.ctx      = &ctx_in;
    struct gguf_context * gguf_in = gguf_init_from_file(params->fn_model_base.c_str(), params_gguf);

    // create new gguf
    struct gguf_context * gguf_out = gguf_init_empty();

    // copy meta data from base model: kv and tensors
    gguf_set_kv(gguf_out, gguf_in);
    int n_tensors = gguf_get_n_tensors(gguf_in);
    for (int i=0; i < n_tensors; ++i) {
        const char * name = gguf_get_tensor_name(gguf_in, i);
        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);
        gguf_add_tensor(gguf_out, tensor);
    }

    // create output file
    struct llama_file fout(params->fn_model_out.c_str(), "wb");
    if (!fout.fp) {
        die_fmt("Could not create file '%s'\n", params->fn_model_out.c_str());
    }

    // write gguf meta data
    std::vector<uint8_t> meta;
    meta.resize(gguf_get_meta_size(gguf_out));
    gguf_get_meta_data(gguf_out, meta.data());
    fout.write_raw(meta.data(), meta.size());

    std::vector<uint8_t> data;
    std::vector<uint8_t> padding;
    for (int i=0; i < n_tensors; ++i) {
        const char * name = gguf_get_tensor_name(gguf_in, i);
        struct ggml_tensor * tensor = ggml_get_tensor(ctx_in, name);

        // read tensor data
        data.resize(ggml_nbytes(tensor));
        tensor->data = data.data();
        size_t offset = gguf_get_tensor_offset(gguf_in, i);
        fin.seek(offset + meta.size(), SEEK_SET);
        fin.read_raw(data.data(), data.size());

        // apply all loras
        for (size_t k = 0; k < loras.size(); ++k) {
            apply_lora(tensor, loras[k], params->n_threads);
        }

        // write tensor data + padding
        padding.clear();
        padding.resize(GGML_PAD(data.size(), gguf_get_alignment(gguf_out)) - data.size(), 0);

        GGML_ASSERT(fout.tell() == offset + meta.size());
        // fout.seek(offset + meta.size(), SEEK_SET);
        fout.write_raw(data.data(), data.size());
        fout.write_raw(padding.data(), padding.size());

        if (i % 2 == 0) {
            printf(".");
        }
    }
    printf("\n");

    // close gguf
    gguf_free(gguf_out);
    gguf_free(gguf_in);

    // free loras
    for (size_t i = 0; i < loras.size(); ++i) {
        free_lora(loras[i]);
    }
}

int main(int argc, char ** argv) {
    struct export_lora_params params = get_default_export_lora_params();

    if (!export_lora_params_parse(argc, argv, &params)) {
        return 1;
    }

    export_lora(&params);

    return 0;
}

examples/finetune/CMakeLists.txt (new file, 5 lines)

set(TARGET finetune)
add_executable(${TARGET} finetune.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

90
examples/finetune/README.md
Normal file
@@ -0,0 +1,90 @@
# finetune

Basic usage instructions:

```bash
# get training data
wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/shakespeare.txt

# finetune LORA adapter
./bin/finetune \
        --model-base open-llama-3b-v2-q8_0.gguf \
        --checkpoint-in chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf \
        --checkpoint-out chk-lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.gguf \
        --lora-out lora-open-llama-3b-v2-q8_0-shakespeare-ITERATION.bin \
        --train-data "shakespeare.txt" \
        --save-every 10 \
        --threads 6 --adam-iter 30 --batch 4 --ctx 64 \
        --use-checkpointing

# predict
./bin/main -m open-llama-3b-v2-q8_0.gguf --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin
```

Finetune output files will be saved every N iterations (set with `--save-every N`).
The pattern 'ITERATION' in the output filenames will be replaced with the iteration number, and with 'LATEST' for the latest output.
So in the above example, after 10 iterations these files will be written:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-10.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-10.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin

After 10 more iterations:
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-20.gguf
- chk-lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.gguf
- lora-open-llama-3b-v2-q8_0-shakespeare-20.bin
- lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin

Checkpoint files (`--checkpoint-in FN`, `--checkpoint-out FN`) store the training process. When the input checkpoint file does not exist, finetuning starts from a new, randomly initialized adapter.
llama.cpp compatible LORA adapters will be saved with the filename specified by `--lora-out FN`.
These LORA adapters can then be used by `main` together with the base model, like in the 'predict' example command above.

In `main` you can also load multiple LORA adapters, which will then be mixed together.

For example, if you have two LORA adapters `lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin` and `lora-open-llama-3b-v2-q8_0-bible-LATEST.bin`, you can mix them together like this:

```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \
  --lora lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin \
  --lora lora-open-llama-3b-v2-q8_0-bible-LATEST.bin
```

You can change how strongly each LORA adapter is applied to the base model by using `--lora-scaled FN SCALE` instead of `--lora FN`.

For example, to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' LORA adapter and 100% of yet another one:

```bash
./bin/main -m open-llama-3b-v2-q8_0.gguf \
  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```

The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.
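
As a rough sketch of what the scale does (assuming the standard LoRA formulation; it matches `scaling = scale * lora_alpha / lora_r` in `llama.cpp`): each adapter modifies a base weight matrix as `W' = W + scale * (lora_alpha / lora_r) * (B * A)`, so `--lora-scaled FN 0.4` simply multiplies that adapter's update by 0.4 before it is added to the base weights.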

Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime, because activations are recomputed during the backward pass instead of being kept in memory.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.

The default LORA rank can be specified with `--lora-r N`.
The LORA rank can be configured for each model tensor type separately with these command line options:

```bash
  --lora-r N           LORA r: default rank. Also specifies resulting scaling together with lora-alpha. (default 4)
  --rank-att-norm N    LORA rank for attention norm tensor (default 1)
  --rank-ffn-norm N    LORA rank for feed-forward norm tensor (default 1)
  --rank-out-norm N    LORA rank for output norm tensor (default 1)
  --rank-tok-embd N    LORA rank for token embeddings tensor (default 4)
  --rank-out N         LORA rank for output tensor (default 4)
  --rank-wq N          LORA rank for wq tensor (default 4)
  --rank-wk N          LORA rank for wk tensor (default 4)
  --rank-wv N          LORA rank for wv tensor (default 4)
  --rank-wo N          LORA rank for wo tensor (default 4)
  --rank-w1 N          LORA rank for w1 tensor (default 4)
  --rank-w2 N          LORA rank for w2 tensor (default 4)
  --rank-w3 N          LORA rank for w3 tensor (default 4)
```

The LORA rank of 'norm' tensors should always be 1.

To see all available options use `finetune --help`.
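
Checkpoints written by older, pre-GGUF versions of this example can be converted to the new GGUF checkpoint format with the accompanying `convert-finetune-checkpoint-to-gguf.py` script. A sketch of an invocation (the filenames below are only placeholders for an old-format checkpoint):

```bash
python3 examples/finetune/convert-finetune-checkpoint-to-gguf.py \
  --input  chk-lora-shakespeare-LATEST.bin \
  --output chk-lora-shakespeare-LATEST.gguf
```

If this fails with a tensor size mismatch, pass the model's feed-forward size explicitly with `--ff N`.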
489
examples/finetune/convert-finetune-checkpoint-to-gguf.py
Normal file
@@ -0,0 +1,489 @@
||||||
|
#!/usr/bin/env python3
|
||||||
|
# finetune checkpoint --> gguf conversion
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import gguf
|
||||||
|
import os
|
||||||
|
import struct
|
||||||
|
import sys
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# gguf constants
|
||||||
|
LLM_KV_OPTIMIZER_TYPE = "optimizer.type"
|
||||||
|
LLM_KV_OPTIMIZER_TYPE_ADAM = "adam"
|
||||||
|
LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs"
|
||||||
|
LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version"
|
||||||
|
LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count"
|
||||||
|
LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count"
|
||||||
|
LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count"
|
||||||
|
LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized"
|
||||||
|
LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss"
|
||||||
|
LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss"
|
||||||
|
LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end"
|
||||||
|
LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count"
|
||||||
|
|
||||||
|
LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments"
|
||||||
|
LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments"
|
||||||
|
LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values"
|
||||||
|
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||||
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||||
|
|
||||||
|
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
||||||
|
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
||||||
|
LLM_KV_TRAINING_TYPE = "training.type"
|
||||||
|
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||||
|
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||||
|
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||||
|
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||||
|
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD = "training.lora.rank.token_embd"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM = "training.lora.rank.output_norm"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_OUTPUT = "training.lora.rank.output"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_NORM = "training.lora.rank.attn_norm"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_Q = "training.lora.rank.attn_q"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_K = "training.lora.rank.attn_k"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_V = "training.lora.rank.attn_v"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_ATTN_OUT = "training.lora.rank.attn_output"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_FFN_NORM = "training.lora.rank.ffn_norm"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_FFN_GATE = "training.lora.rank.ffn_gate"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_FFN_DOWN = "training.lora.rank.ffn_down"
|
||||||
|
LLM_KV_TRAINING_LORA_RANK_FFN_UP = "training.lora.rank.ffn_up"
|
||||||
|
|
||||||
|
class Tensor:
|
||||||
|
def __init__(self, dtype='f', ne=None):
|
||||||
|
if ne is None:
|
||||||
|
ne = []
|
||||||
|
self.dtype = dtype
|
||||||
|
self.ne = ne
|
||||||
|
self.nbytes = 0
|
||||||
|
if self.dtype == 'f':
|
||||||
|
if len(self.ne) == 0:
|
||||||
|
self.nbytes = 0
|
||||||
|
else:
|
||||||
|
self.nbytes = int(np.product(self.ne)) * 4
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
nd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
namelen = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
dtype = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
|
assert(nd == len(self.ne))
|
||||||
|
ne = []
|
||||||
|
for d in range(nd):
|
||||||
|
n = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
ne.append(n)
|
||||||
|
|
||||||
|
if tuple(ne) != tuple(self.ne):
|
||||||
|
raise ValueError(f"Tensor.load: Expected number of elements {str(self.ne)} does not match what is read from file {str(ne)}")
|
||||||
|
|
||||||
|
if self.dtype == 'f':
|
||||||
|
assert(dtype == 0)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unhandled data type '{self.dtype}'")
|
||||||
|
|
||||||
|
self.name = bytes(data[offset:offset+namelen]); offset += namelen
|
||||||
|
# 32-byte alignment
|
||||||
|
offset += (0 - offset) & 31
|
||||||
|
self.data = data[offset:offset+self.nbytes]
|
||||||
|
offset += self.nbytes
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def max_storage_size(self):
|
||||||
|
result = 0
|
||||||
|
result += 4 # nd
|
||||||
|
result += 4 # namelen
|
||||||
|
result += 4 # dtype
|
||||||
|
result += len(self.ne)*8 # ne
|
||||||
|
result += 48 # name (maximum as of commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9)
|
||||||
|
result += 31 # 32-byte alignment
|
||||||
|
result += self.nbytes
|
||||||
|
return result
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer, name):
|
||||||
|
gguf_writer.add_tensor(
|
||||||
|
name=name,
|
||||||
|
tensor=self.data,
|
||||||
|
raw_shape=np.array(list(reversed(self.ne))),
|
||||||
|
raw_dtype=gguf.GGMLQuantizationType.F32)
|
||||||
|
|
||||||
|
class OptimizationContext:
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]
|
||||||
|
offset += 4
|
||||||
|
|
||||||
|
if self.version != 1:
|
||||||
|
raise ValueError('Invalid version of optimization context in checkpoint file')
|
||||||
|
|
||||||
|
self.past = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_m = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.nx = struct.unpack('N', bytes(data[offset:offset + 8]))[0]; offset += 8
|
||||||
|
self.iter = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.just_initialized = bool(struct.unpack('<i', bytes(data[offset:offset + 4]))[0]); offset += 4
|
||||||
|
|
||||||
|
self.adam_m = Tensor('f', [self.nx])
|
||||||
|
self.adam_v = Tensor('f', [self.nx])
|
||||||
|
self.adam_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||||
|
|
||||||
|
self.lbfgs_x = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_xp = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_g = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_gp = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_d = Tensor('f', [self.nx])
|
||||||
|
self.lbfgs_pf = Tensor('f', [self.past] if self.past > 0 else [])
|
||||||
|
self.lbfgs_lmal = Tensor('f', [self.lbfgs_m])
|
||||||
|
self.lbfgs_lmys = Tensor('f', [self.lbfgs_m])
|
||||||
|
self.lbfgs_lms = Tensor('f', [self.nx, self.lbfgs_m])
|
||||||
|
self.lbfgs_lmy = Tensor('f', [self.nx, self.lbfgs_m])
|
||||||
|
|
||||||
|
# forgot to save type in version 1:
|
||||||
|
# guess self.type from number of remaining bytes
|
||||||
|
size_type_0 = 12 + sum([t.max_storage_size() for t in
|
||||||
|
[self.adam_m, self.adam_v]
|
||||||
|
+([self.adam_pf] if (self.past > 0) else [])])
|
||||||
|
size_type_1 = 24 + sum([t.max_storage_size() for t in
|
||||||
|
[self.lbfgs_x, self.lbfgs_xp, self.lbfgs_g,
|
||||||
|
self.lbfgs_gp, self.lbfgs_d, self.lbfgs_pf,
|
||||||
|
self.lbfgs_lmal, self.lbfgs_lmys,
|
||||||
|
self.lbfgs_lms, self.lbfgs_lmy]
|
||||||
|
+([self.lbfgs_pf] if (self.past > 0) else [])])
|
||||||
|
# due to alignment padding the size might not by exact
|
||||||
|
# but the difference in size for both types is significant,
|
||||||
|
# so we can just use whichever is closest
|
||||||
|
remaining = len(data) - offset
|
||||||
|
if abs(remaining - size_type_0) < abs(remaining - size_type_1):
|
||||||
|
self.type = 0
|
||||||
|
else:
|
||||||
|
self.type = 1
|
||||||
|
|
||||||
|
if self.type == 0:
|
||||||
|
offset = self.adam_m.load(data, offset)
|
||||||
|
offset = self.adam_v.load(data, offset)
|
||||||
|
offset = self.adam_pf.load(data,offset)
|
||||||
|
|
||||||
|
self.adam_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.adam_fx_prev = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.adam_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
|
elif self.type == 1:
|
||||||
|
offset = self.lbfgs_x.load(data, offset)
|
||||||
|
offset = self.lbfgs_xp.load(data, offset)
|
||||||
|
offset = self.lbfgs_g.load(data, offset)
|
||||||
|
offset = self.lbfgs_gp.load(data, offset)
|
||||||
|
offset = self.lbfgs_d.load(data, offset)
|
||||||
|
offset = self.lbfgs_pf.load(data, offset)
|
||||||
|
offset = self.lbfgs_lmal.load(data, offset)
|
||||||
|
offset = self.lbfgs_lmys.load(data, offset)
|
||||||
|
offset = self.lbfgs_lms.load(data, offset)
|
||||||
|
offset = self.lbfgs_lmy.load(data, offset)
|
||||||
|
|
||||||
|
self.lbfgs_fx_best = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_step = struct.unpack('<f', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_j = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_k = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_end = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.lbfgs_n_no_improvement = struct.unpack('<i', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid optimizer type '{self.type}'")
|
||||||
|
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_FILE_VERSION, 0)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, self.past)
|
||||||
|
gguf_writer.add_uint64(LLM_KV_OPTIMIZER_PARAMETER_COUNT, self.nx)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ITERATION_COUNT, self.iter)
|
||||||
|
gguf_writer.add_bool(LLM_KV_OPTIMIZER_JUST_INITIALIZED, self.just_initialized)
|
||||||
|
|
||||||
|
if self.type == 0:
|
||||||
|
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_ADAM)
|
||||||
|
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_BEST_LOSS, self.adam_fx_best)
|
||||||
|
gguf_writer.add_float32(LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS, self.adam_fx_prev)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT, self.adam_n_no_improvement)
|
||||||
|
|
||||||
|
self.adam_m.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS)
|
||||||
|
self.adam_v.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS)
|
||||||
|
if self.past > 0:
|
||||||
|
self.adam_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES)
|
||||||
|
|
||||||
|
elif self.type == 1:
|
||||||
|
gguf_writer.add_string(LLM_KV_OPTIMIZER_TYPE, LLM_KV_OPTIMIZER_TYPE_LBFGS)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT, self.lbfgs_m)
|
||||||
|
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS, self.lbfgs_fx_best)
|
||||||
|
gguf_writer.add_float32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP, self.lbfgs_step)
|
||||||
|
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J, self.lbfgs_j)
|
||||||
|
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K, self.lbfgs_k)
|
||||||
|
gguf_writer.add_int32(LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END, self.lbfgs_end)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT, self.lbfgs_n_no_improvement)
|
||||||
|
|
||||||
|
self.lbfgs_x.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS)
|
||||||
|
self.lbfgs_xp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS)
|
||||||
|
self.lbfgs_g.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS)
|
||||||
|
self.lbfgs_gp.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS)
|
||||||
|
self.lbfgs_d.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION)
|
||||||
|
if self.past > 0:
|
||||||
|
self.lbfgs_pf.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES)
|
||||||
|
self.lbfgs_lmal.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA)
|
||||||
|
self.lbfgs_lmys.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS)
|
||||||
|
self.lbfgs_lms.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S)
|
||||||
|
self.lbfgs_lmy.save_gguf(gguf_writer, name=LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y)
|
||||||
|
else:
|
||||||
|
raise ValueError('Unknown optimizer type')
|
||||||
|
|
||||||
|
class LoraParams:
|
||||||
|
def __init__(self):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
self.n_rank_attention_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_wq = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_wk = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_wv = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_wo = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_ffn_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_w1 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_w2 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_w3 = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_tok_embeddings = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_norm = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rank_output = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_TOKEN_EMBD, self.n_rank_tok_embeddings)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT_NORM, self.n_rank_norm)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_OUTPUT, self.n_rank_output)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_NORM, self.n_rank_attention_norm)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_Q, self.n_rank_wq)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_K, self.n_rank_wk)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_V, self.n_rank_wv)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_ATTN_OUT, self.n_rank_wo)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_NORM, self.n_rank_ffn_norm)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_GATE, self.n_rank_w1)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_DOWN, self.n_rank_w2)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_LORA_RANK_FFN_UP, self.n_rank_w3)
|
||||||
|
|
||||||
|
class ModelParams:
|
||||||
|
def __init__(self, n_ff = None):
|
||||||
|
self.n_ff = n_ff
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
self.n_vocab = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_embd = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_mult = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_head = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_layer = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.n_rot = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def get_n_ff(self):
|
||||||
|
if self.n_ff is None:
|
||||||
|
# struct my_llama_model::get_n_ff in train-text-from-scratch.cpp commit 3b5515bbe0e2224425986ba24f1f5d84aa38dce9
|
||||||
|
return ((2*(4*self.n_embd)//3 + self.n_mult - 1)//self.n_mult)*self.n_mult
|
||||||
|
else:
|
||||||
|
return self.n_ff
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
# self.n_vocab not saved
|
||||||
|
gguf_writer.add_embedding_length(self.n_embd)
|
||||||
|
gguf_writer.add_head_count(self.n_head)
|
||||||
|
gguf_writer.add_block_count(self.n_layer)
|
||||||
|
gguf_writer.add_rope_dimension_count(self.n_rot)
|
||||||
|
gguf_writer.add_feed_forward_length(self.get_n_ff())
|
||||||
|
|
||||||
|
def tensor_name(key, bid=None, suffix=".weight"):
|
||||||
|
return gguf.MODEL_TENSOR_NAMES[gguf.MODEL_ARCH.LLAMA][key].format(bid=bid) + suffix
|
||||||
|
|
||||||
|
class Layer:
|
||||||
|
def __init__(self, params, lora_params, bid):
|
||||||
|
self.bid = bid
|
||||||
|
self.att_norm_a = Tensor('f', [lora_params.n_rank_attention_norm, params.n_embd])
|
||||||
|
self.att_norm_b = Tensor('f', [lora_params.n_rank_attention_norm, 1])
|
||||||
|
self.wq_a = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
||||||
|
self.wq_b = Tensor('f', [lora_params.n_rank_wq, params.n_embd])
|
||||||
|
self.wk_a = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
||||||
|
self.wk_b = Tensor('f', [lora_params.n_rank_wk, params.n_embd])
|
||||||
|
self.wv_a = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
||||||
|
self.wv_b = Tensor('f', [lora_params.n_rank_wv, params.n_embd])
|
||||||
|
self.wo_a = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
||||||
|
self.wo_b = Tensor('f', [lora_params.n_rank_wo, params.n_embd])
|
||||||
|
self.ffn_norm_a = Tensor('f', [lora_params.n_rank_ffn_norm, params.n_embd])
|
||||||
|
self.ffn_norm_b = Tensor('f', [lora_params.n_rank_ffn_norm, 1])
|
||||||
|
self.w1_a = Tensor('f', [lora_params.n_rank_w1, params.n_embd])
|
||||||
|
self.w1_b = Tensor('f', [lora_params.n_rank_w1, params.get_n_ff()])
|
||||||
|
self.w2_a = Tensor('f', [lora_params.n_rank_w2, params.get_n_ff()])
|
||||||
|
self.w2_b = Tensor('f', [lora_params.n_rank_w2, params.n_embd])
|
||||||
|
self.w3_a = Tensor('f', [lora_params.n_rank_w3, params.n_embd])
|
||||||
|
self.w3_b = Tensor('f', [lora_params.n_rank_w3, params.get_n_ff()])
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
offset = self.att_norm_a.load(data, offset)
|
||||||
|
offset = self.att_norm_b.load(data, offset)
|
||||||
|
offset = self.wq_a.load(data, offset)
|
||||||
|
offset = self.wq_b.load(data, offset)
|
||||||
|
offset = self.wk_a.load(data, offset)
|
||||||
|
offset = self.wk_b.load(data, offset)
|
||||||
|
offset = self.wv_a.load(data, offset)
|
||||||
|
offset = self.wv_b.load(data, offset)
|
||||||
|
offset = self.wo_a.load(data, offset)
|
||||||
|
offset = self.wo_b.load(data, offset)
|
||||||
|
offset = self.ffn_norm_a.load(data, offset)
|
||||||
|
offset = self.ffn_norm_b.load(data, offset)
|
||||||
|
offset = self.w1_a.load(data, offset)
|
||||||
|
offset = self.w1_b.load(data, offset)
|
||||||
|
offset = self.w2_a.load(data, offset)
|
||||||
|
offset = self.w2_b.load(data, offset)
|
||||||
|
offset = self.w3_a.load(data, offset)
|
||||||
|
offset = self.w3_b.load(data, offset)
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
self.att_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_a"))
|
||||||
|
self.att_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, self.bid, ".weight.lora_b"))
|
||||||
|
self.wq_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_a"))
|
||||||
|
self.wq_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_Q, self.bid, ".weight.lora_b"))
|
||||||
|
self.wk_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_a"))
|
||||||
|
self.wk_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_K, self.bid, ".weight.lora_b"))
|
||||||
|
self.wv_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_a"))
|
||||||
|
self.wv_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_V, self.bid, ".weight.lora_b"))
|
||||||
|
self.wo_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_a"))
|
||||||
|
self.wo_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, self.bid, ".weight.lora_b"))
|
||||||
|
self.ffn_norm_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_a"))
|
||||||
|
self.ffn_norm_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_NORM, self.bid, ".weight.lora_b"))
|
||||||
|
self.w1_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_a"))
|
||||||
|
self.w1_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_GATE, self.bid, ".weight.lora_b"))
|
||||||
|
self.w2_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_a"))
|
||||||
|
self.w2_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, self.bid, ".weight.lora_b"))
|
||||||
|
self.w3_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_a"))
|
||||||
|
self.w3_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.FFN_UP, self.bid, ".weight.lora_b"))
|
||||||
|
|
||||||
|
class LoraModel:
|
||||||
|
def __init__(self, n_ff = None):
|
||||||
|
self.params = ModelParams(n_ff = n_ff)
|
||||||
|
self.lora_params = LoraParams()
|
||||||
|
self.layers = []
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
offset = self.params.load(data, offset)
|
||||||
|
offset = self.lora_params.load(data, offset)
|
||||||
|
|
||||||
|
self.tok_embd_a = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_embd])
|
||||||
|
self.tok_embd_b = Tensor('f', [self.lora_params.n_rank_tok_embeddings, self.params.n_vocab])
|
||||||
|
self.norm_a = Tensor('f', [self.lora_params.n_rank_norm, self.params.n_embd])
|
||||||
|
self.norm_b = Tensor('f', [self.lora_params.n_rank_norm, 1])
|
||||||
|
self.output_a = Tensor('f', [self.lora_params.n_rank_output, self.params.n_embd])
|
||||||
|
self.output_b = Tensor('f', [self.lora_params.n_rank_output, self.params.n_vocab])
|
||||||
|
|
||||||
|
offset = self.tok_embd_a.load(data, offset)
|
||||||
|
offset = self.tok_embd_b.load(data, offset)
|
||||||
|
offset = self.norm_a.load(data, offset)
|
||||||
|
offset = self.norm_b.load(data, offset)
|
||||||
|
offset = self.output_a.load(data, offset)
|
||||||
|
offset = self.output_b.load(data, offset)
|
||||||
|
|
||||||
|
self.layers.clear()
|
||||||
|
for bid in range(self.params.n_layer):
|
||||||
|
layer = Layer(self.params, self.lora_params, bid)
|
||||||
|
offset = layer.load(data, offset)
|
||||||
|
self.layers.append(layer)
|
||||||
|
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
self.params.save_gguf(gguf_writer)
|
||||||
|
self.lora_params.save_gguf(gguf_writer)
|
||||||
|
|
||||||
|
self.tok_embd_a.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_a"))
|
||||||
|
self.tok_embd_b.save_gguf(gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD, suffix=".weight.lora_b"))
|
||||||
|
self.norm_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_a"))
|
||||||
|
self.norm_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM, suffix=".weight.lora_b"))
|
||||||
|
self.output_a.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_a"))
|
||||||
|
self.output_b.save_gguf (gguf_writer, name=tensor_name(gguf.MODEL_TENSOR.OUTPUT, suffix=".weight.lora_b"))
|
||||||
|
|
||||||
|
for layer in self.layers:
|
||||||
|
layer.save_gguf(gguf_writer)
|
||||||
|
|
||||||
|
class LoraCheckpoint:
|
||||||
|
def __init__(self, n_ff = None):
|
||||||
|
self.model = LoraModel(n_ff = n_ff)
|
||||||
|
self.opt_ctx = OptimizationContext()
|
||||||
|
|
||||||
|
def load(self, data, offset):
|
||||||
|
magic = bytes(reversed(data[offset:offset + 4])); offset += 4
|
||||||
|
if magic != b'ggcl':
|
||||||
|
raise ValueError(f"File header magic indicates, that this is no finetune-lora checkpoint file. Expected 'ggcl', Got '{str(magic)}'")
|
||||||
|
|
||||||
|
self.version = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
if self.version != 0:
|
||||||
|
raise ValueError('Invalid version of checkpoint file')
|
||||||
|
|
||||||
|
self.train_its = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.train_samples = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
self.train_tokens = struct.unpack('<I', bytes(data[offset:offset + 4]))[0]; offset += 4
|
||||||
|
|
||||||
|
offset = self.model.load(data, offset)
|
||||||
|
offset = self.opt_ctx.load(data, offset)
|
||||||
|
|
||||||
|
return offset
|
||||||
|
|
||||||
|
def save_gguf(self, gguf_writer):
|
||||||
|
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||||
|
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||||
|
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_FINETUNE_LORA)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||||
|
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||||
|
self.model.save_gguf(gguf_writer)
|
||||||
|
self.opt_ctx.save_gguf(gguf_writer)
|
||||||
|
|
||||||
|
def handle_args():
|
||||||
|
parser = argparse.ArgumentParser(description = 'Convert finetune checkpoints to GGUF')
|
||||||
|
parser.add_argument('--input', '-i', type = Path, help = 'Input finetune checkpoint filename', required=True)
|
||||||
|
parser.add_argument('--output', '-o', type = Path, help = 'Output GGUF filename', required=True)
|
||||||
|
parser.add_argument('--ff', type = int, help = "Feedforward size, if not provided compute from n_mult. Provide this if you get 'ValueError: Tensor.load: Expected number of elements does not match what is read from file'", required=False)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
def main():
|
||||||
|
cfg = handle_args()
|
||||||
|
print(cfg)
|
||||||
|
data = np.memmap(cfg.input, mode = 'r')
|
||||||
|
chk = LoraCheckpoint(n_ff = cfg.ff)
|
||||||
|
offset = 0
|
||||||
|
offset = chk.load(data, offset)
|
||||||
|
# we should have read all available data
|
||||||
|
assert(offset == len(data))
|
||||||
|
|
||||||
|
gguf_writer = gguf.GGUFWriter(cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], use_temp_file = False)
|
||||||
|
chk.save_gguf(gguf_writer)
|
||||||
|
print(" gguf: write header")
|
||||||
|
gguf_writer.write_header_to_file()
|
||||||
|
print(" gguf: write metadata")
|
||||||
|
gguf_writer.write_kv_data_to_file()
|
||||||
|
print(" gguf: write tensors")
|
||||||
|
gguf_writer.write_tensors_to_file()
|
||||||
|
gguf_writer.close()
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
1937
examples/finetune/finetune.cpp
Normal file
File diff suppressed because it is too large
|
@ -947,7 +947,23 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
||||||
invalid_param = true;
|
invalid_param = true;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
params.lora_adapter = argv[i];
|
params.lora_adapter.push_back({argv[i], 1.0f});
|
||||||
|
params.use_mmap = false;
|
||||||
|
}
|
||||||
|
else if (arg == "--lora-scaled")
|
||||||
|
{
|
||||||
|
if (++i >= argc)
|
||||||
|
{
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
const char * lora_adapter = argv[i];
|
||||||
|
if (++i >= argc)
|
||||||
|
{
|
||||||
|
invalid_param = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
|
||||||
params.use_mmap = false;
|
params.use_mmap = false;
|
||||||
}
|
}
|
||||||
else if (arg == "--lora-base")
|
else if (arg == "--lora-base")
|
||||||
|
|
|
@ -10,9 +10,9 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||||
./bin/train-text-from-scratch \
|
./bin/train-text-from-scratch \
|
||||||
--vocab-model ../models/ggml-vocab-llama.gguf \
|
--vocab-model ../models/ggml-vocab-llama.gguf \
|
||||||
--ctx 64 --embd 256 --head 8 --layer 16 \
|
--ctx 64 --embd 256 --head 8 --layer 16 \
|
||||||
--checkpoint-in chk-shakespeare-256x16.gguf \
|
--checkpoint-in chk-shakespeare-256x16-LATEST.gguf \
|
||||||
--checkpoint-out chk-shakespeare-256x16.gguf \
|
--checkpoint-out chk-shakespeare-256x16-ITERATION.gguf \
|
||||||
--model-out ggml-shakespeare-256x16-f32.gguf \
|
--model-out ggml-shakespeare-256x16-f32-ITERATION.gguf \
|
||||||
--train-data "shakespeare.txt" \
|
--train-data "shakespeare.txt" \
|
||||||
-t 6 -b 16 --seed 1 --adam-iter 256 \
|
-t 6 -b 16 --seed 1 --adam-iter 256 \
|
||||||
--no-checkpointing
|
--no-checkpointing
|
||||||
|
@ -20,3 +20,8 @@ wget https://raw.githubusercontent.com/brunoklein99/deep-learning-notes/master/s
|
||||||
# predict
|
# predict
|
||||||
./bin/main -m ggml-shakespeare-256x16-f32.gguf
|
./bin/main -m ggml-shakespeare-256x16-f32.gguf
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Output files will be saved every N iterations (config with `--save-every N`).
|
||||||
|
The pattern "ITERATION" in the output filenames will be replaced with the iteration number and "LATEST" for the latest output.
|
||||||
|
|
||||||
|
To train GGUF models just pass them to `--checkpoint-in FN`.
|
||||||
|
|
|
@ -47,10 +47,13 @@ LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys"
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s"
|
||||||
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y"
|
||||||
|
|
||||||
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"
|
||||||
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
LLM_KV_TRAINING_TYPE_FINETUNE_LORA = "finetune_lora"
|
||||||
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
LLM_KV_TRAINING_TYPE = "training.type"
|
||||||
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
LLM_KV_TRAINING_FILE_VERSION = "training.file_version"
|
||||||
|
LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count"
|
||||||
|
LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count"
|
||||||
|
LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count"
|
||||||
|
|
||||||
class Tensor:
|
class Tensor:
|
||||||
def __init__(self, dtype='f', ne=None):
|
def __init__(self, dtype='f', ne=None):
|
||||||
|
@ -460,6 +463,7 @@ class Checkpoint:
|
||||||
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
gguf_writer.add_file_type(gguf.GGMLQuantizationType.F32)
|
||||||
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
gguf_writer.add_layer_norm_rms_eps(1e-5)
|
||||||
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
gguf_writer.add_uint32(LLM_KV_TRAINING_FILE_VERSION, 0)
|
||||||
|
gguf_writer.add_string(LLM_KV_TRAINING_TYPE, LLM_KV_TRAINING_TYPE_TRAIN_MODEL)
|
||||||
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
gguf_writer.add_uint32(LLM_KV_TRAINING_ITERATION_COUNT, self.train_its)
|
||||||
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
gguf_writer.add_uint32(LLM_KV_TRAINING_SAMPLE_COUNT, self.train_samples)
|
||||||
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
gguf_writer.add_uint32(LLM_KV_TRAINING_TOKEN_COUNT, self.train_tokens)
|
||||||
|
|
File diff suppressed because it is too large
10
ggml-alloc.c
10
ggml-alloc.c
|
@ -77,7 +77,7 @@ struct free_block {
|
||||||
size_t size;
|
size_t size;
|
||||||
};
|
};
|
||||||
|
|
||||||
#define MAX_FREE_BLOCKS 128
|
#define MAX_FREE_BLOCKS 256
|
||||||
|
|
||||||
struct ggml_allocr {
|
struct ggml_allocr {
|
||||||
void * data;
|
void * data;
|
||||||
|
@ -187,6 +187,7 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor)
|
||||||
}
|
}
|
||||||
|
|
||||||
tensor->data = addr;
|
tensor->data = addr;
|
||||||
|
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
add_allocated_tensor(alloc, tensor);
|
add_allocated_tensor(alloc, tensor);
|
||||||
|
@ -218,7 +219,8 @@ static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tens
|
||||||
|
|
||||||
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
|
||||||
size = aligned_offset(NULL, size, alloc->alignment);
|
size = aligned_offset(NULL, size, alloc->alignment);
|
||||||
AT_PRINTF("%s: freeing %s (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, size, alloc->n_free_blocks);
|
AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
|
||||||
|
AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
remove_allocated_tensor(alloc, tensor);
|
remove_allocated_tensor(alloc, tensor);
|
||||||
|
@ -631,3 +633,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n(
|
||||||
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
|
||||||
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
|
||||||
|
return alloc->max_size;
|
||||||
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@ GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
|
||||||
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
|
||||||
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
|
||||||
|
GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
83
ggml.h
83
ggml.h
|
@ -214,8 +214,8 @@
|
||||||
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
|
||||||
|
|
||||||
#define GGML_MAX_DIMS 4
|
#define GGML_MAX_DIMS 4
|
||||||
#define GGML_MAX_NODES 4096
|
#define GGML_MAX_NODES 16384
|
||||||
#define GGML_MAX_PARAMS 256
|
#define GGML_MAX_PARAMS 1024
|
||||||
#define GGML_MAX_CONTEXTS 64
|
#define GGML_MAX_CONTEXTS 64
|
||||||
#define GGML_MAX_SRC 6
|
#define GGML_MAX_SRC 6
|
||||||
#define GGML_MAX_NAME 64
|
#define GGML_MAX_NAME 64
|
||||||
|
@ -526,7 +526,15 @@ extern "C" {
|
||||||
// next prime after GGML_MAX_NODES
|
// next prime after GGML_MAX_NODES
|
||||||
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
// #define GGML_GRAPH_HASHTABLE_SIZE 4099
|
||||||
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
// next prime after GGML_MAX_NODES * 2 (nodes + leafs)
|
||||||
#define GGML_GRAPH_HASHTABLE_SIZE 8273
|
// #define GGML_GRAPH_HASHTABLE_SIZE 8273
|
||||||
|
// #define GGML_GRAPH_HASHTABLE_SIZE 16411
|
||||||
|
#define GGML_GRAPH_HASHTABLE_SIZE 32771
|
||||||
|
|
||||||
|
enum ggml_cgraph_eval_order {
|
||||||
|
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
|
||||||
|
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
|
||||||
|
GGML_CGRAPH_EVAL_ORDER_COUNT
|
||||||
|
};
|
||||||
|
|
||||||
// computation graph
|
// computation graph
|
||||||
struct ggml_cgraph {
|
struct ggml_cgraph {
|
||||||
|
@ -539,6 +547,8 @@ extern "C" {
|
||||||
|
|
||||||
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
void * visited_hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||||
|
|
||||||
|
enum ggml_cgraph_eval_order order;
|
||||||
|
|
||||||
// performance
|
// performance
|
||||||
int perf_runs;
|
int perf_runs;
|
||||||
int64_t perf_cycles;
|
int64_t perf_cycles;
|
||||||
|
@ -686,12 +696,21 @@ extern "C" {
|
||||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||||
|
|
||||||
|
// Converts a flat index into coordinates
|
||||||
|
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
|
||||||
|
|
||||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||||
|
|
||||||
|
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
|
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||||
|
|
||||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||||
|
|
||||||
|
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||||
|
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||||
|
|
||||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
|
@ -725,6 +744,12 @@ extern "C" {
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_add_cast(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b,
|
||||||
|
enum ggml_type type);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_add1(
|
GGML_API struct ggml_tensor * ggml_add1(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
@ -834,6 +859,7 @@ extern "C" {
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
|
// sums repetitions in a into shape of b
|
||||||
GGML_API struct ggml_tensor * ggml_repeat_back(
|
GGML_API struct ggml_tensor * ggml_repeat_back(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
|
@ -1689,6 +1715,16 @@ extern "C" {
|
||||||
// dump the graph into a file using the dot format
|
// dump the graph into a file using the dot format
|
||||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
||||||
|
|
||||||
|
// build gradient checkpointing backward graph gb for gf using provided checkpoints
|
||||||
|
// gb_tmp will contain original backward graph with rewritten backward process nodes,
|
||||||
|
// but without the second forward pass nodes.
|
||||||
|
GGML_API void ggml_build_backward_gradient_checkpointing(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_cgraph * gf,
|
||||||
|
struct ggml_cgraph * gb,
|
||||||
|
struct ggml_cgraph * gb_tmp,
|
||||||
|
struct ggml_tensor * * checkpoints,
|
||||||
|
int n_checkpoints);
|
||||||
//
|
//
|
||||||
// optimization
|
// optimization
|
||||||
//
|
//
|
||||||
|
@ -1723,7 +1759,7 @@ extern "C" {
|
||||||
GGML_LINESEARCH_INVALID_PARAMETERS,
|
GGML_LINESEARCH_INVALID_PARAMETERS,
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
|
||||||
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
||||||
|
|
||||||
// optimization parameters
|
// optimization parameters
|
||||||
|
@ -1755,6 +1791,8 @@ extern "C" {
|
||||||
bool print_forward_graph;
|
bool print_forward_graph;
|
||||||
bool print_backward_graph;
|
bool print_backward_graph;
|
||||||
|
|
||||||
|
int n_gradient_accumulation;
|
||||||
|
|
||||||
// ADAM parameters
|
// ADAM parameters
|
||||||
struct {
|
struct {
|
||||||
int n_iter;
|
int n_iter;
|
||||||
|
@ -1800,6 +1838,7 @@ extern "C" {
|
||||||
float loss_after;
|
float loss_after;
|
||||||
|
|
||||||
struct {
|
struct {
|
||||||
|
struct ggml_tensor * g; // current gradient
|
||||||
struct ggml_tensor * m; // first moment
|
struct ggml_tensor * m; // first moment
|
||||||
struct ggml_tensor * v; // second moment
|
struct ggml_tensor * v; // second moment
|
||||||
struct ggml_tensor * pf; // past function values
|
struct ggml_tensor * pf; // past function values
|
||||||
|
@ -1916,26 +1955,26 @@ extern "C" {
|
||||||
|
|
||||||
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
|
||||||
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
|
||||||
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
|
GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
|
||||||
|
|
||||||
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
|
GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
|
GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
|
||||||
|
|
||||||
// results are undefined if the wrong type is used for the key
|
// will abort if the wrong type is used for the key
|
||||||
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
|
GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
|
GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
|
GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
|
GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
|
GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
|
GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
|
GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
|
GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
|
GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
|
GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
|
GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
|
GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
|
GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
|
GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
|
||||||
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
|
||||||
|
|
||||||
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
|
||||||
|
|
39
llama.cpp
39
llama.cpp
|
@ -1936,20 +1936,18 @@ static void llm_load_vocab(
|
||||||
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
throw std::runtime_error("cannot find tokenizer vocab in model file\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const float * scores = nullptr;
|
||||||
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
const int score_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_SCORES).c_str());
|
||||||
if (score_idx == -1) {
|
if (score_idx != -1) {
|
||||||
throw std::runtime_error("cannot find tokenizer scores in model file\n");
|
scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
const float * scores = (const float * ) gguf_get_arr_data(ctx, score_idx);
|
const int * toktypes = nullptr;
|
||||||
|
|
||||||
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
const int toktype_idx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_TOKEN_TYPE).c_str());
|
||||||
if (toktype_idx == -1) {
|
if (toktype_idx != -1) {
|
||||||
throw std::runtime_error("cannot find token type list in GGUF file\n");
|
toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx);
|
|
||||||
|
|
||||||
// determine vocab type
|
// determine vocab type
|
||||||
{
|
{
|
||||||
std::string tokenizer_name;
|
std::string tokenizer_name;
|
||||||
|
@ -2017,8 +2015,8 @@ static void llm_load_vocab(
|
||||||
|
|
||||||
auto & token_data = vocab.id_to_token[i];
|
auto & token_data = vocab.id_to_token[i];
|
||||||
token_data.text = std::move(word);
|
token_data.text = std::move(word);
|
||||||
token_data.score = scores[i];
|
token_data.score = scores ? scores[i] : 0.0f;
|
||||||
token_data.type = (llama_token_type) toktypes[i];
|
token_data.type = toktypes ? (llama_token_type) toktypes[i] : LLAMA_TOKEN_TYPE_NORMAL;
|
||||||
}
|
}
|
||||||
|
|
||||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||||
|
@@ -6265,7 +6263,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 }
 
 static int llama_apply_lora_from_file_internal(
-    const struct llama_model & model, const char * path_lora, const char * path_base_model, int n_threads
+    const struct llama_model & model, const char * path_lora, float scale, const char * path_base_model, int n_threads
 ) {
     LLAMA_LOG_INFO("%s: applying lora adapter from '%s' - please wait ...\n", __func__, path_lora);
 

@@ -6294,7 +6292,7 @@ static int llama_apply_lora_from_file_internal(
     int32_t lora_alpha;
     fin.read((char *) &lora_r, sizeof(lora_r));
     fin.read((char *) &lora_alpha, sizeof(lora_alpha));
-    float scaling = (float)lora_alpha / (float)lora_r;
+    float scaling = scale * (float)lora_alpha / (float)lora_r;
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 

@@ -6510,9 +6508,10 @@ static int llama_apply_lora_from_file_internal(
             ggml_set_name(r, "r_cpy");
         }
 
-        struct ggml_cgraph gf = ggml_build_forward(r);
+        struct ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+        ggml_build_forward_expand(gf, r);
 
-        ggml_graph_compute_helper(work_buffer, &gf, n_threads);
+        ggml_graph_compute_helper(work_buffer, gf, n_threads);
 
         // we won't need these tensors again, reset the context to save memory
         ggml_free(lora_ctx);
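A quick numeric check of the new factor, with made-up values (passing scale = 1.0f reproduces the old behaviour of alpha / r):

    const float   scale      = 0.5f;   // illustrative value
    const int32_t lora_r     = 16;     // illustrative value
    const int32_t lora_alpha = 32;     // illustrative value
    const float   scaling    = scale * (float)lora_alpha / (float)lora_r;   // 0.5 * 32 / 16 = 1.0f
    // the adapter delta is applied exactly as before, only multiplied by the
    // user-supplied scale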
@@ -6901,6 +6900,10 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
     return nparams;
 }
 
+struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
+    return ggml_get_tensor(model->ctx, name);
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,

@@ -6914,18 +6917,18 @@ int llama_model_quantize(
     }
 }
 
-int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
     }
 }
 
-int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, const char * path_base_model, int n_threads) {
+int llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int n_threads) {
     try {
-        return llama_apply_lora_from_file_internal(*model, path_lora, path_base_model, n_threads);
+        return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
     } catch (const std::exception & err) {
         LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
         return 1;
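A minimal usage sketch for the new accessor; the model pointer is assumed to come from the usual loading API, and the tensor name shown ("token_embd.weight") is the conventional GGUF name for the token embeddings, used here only as an example:

    // look up a tensor by its GGUF name from an already-loaded model;
    // ggml_get_tensor returns NULL when the name is not found
    struct ggml_tensor * tok_embd = llama_get_model_tensor(model, "token_embd.weight");
    if (tok_embd == NULL) {
        fprintf(stderr, "tensor 'token_embd.weight' not found\n");
    }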
llama.h
@@ -291,6 +291,9 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
 
+    // Get a llama model tensor
+    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
+
     // Returns 0 on success
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,

@@ -306,15 +309,17 @@ extern "C" {
     LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
             struct llama_context * ctx,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads),
             "use llama_model_apply_lora_from_file instead");
 
     LLAMA_API int llama_model_apply_lora_from_file(
             const struct llama_model * model,
             const char * path_lora,
+            float scale,
             const char * path_base_model,
             int n_threads);
 
     //
     // KV cache
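A hedged usage sketch of the updated public entry point declared above; the path, scale and thread count are placeholders:

    // apply a LoRA adapter at half strength; passing NULL for path_base_model
    // patches the tensors of the model that is already loaded
    const int rc = llama_model_apply_lora_from_file(model, "lora-adapter.bin", 0.5f, NULL, 4);
    if (rc != 0) {
        fprintf(stderr, "failed to apply LoRA adapter\n");
    }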
tests/test-grad0.cpp
@@ -251,18 +251,20 @@ static bool check_gradient(
         printf("GGML_N_THREADS = %d\n", n_threads);
     }
 
-    struct ggml_cgraph gf = ggml_build_forward (f);
-    struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
+    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx0, f);
+    struct ggml_cgraph * gb = ggml_new_graph(ctx0);
+    *gb = *gf;
+    ggml_build_backward_expand(ctx0, gf, gb, false);
 
-    ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+    ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
 
-    ggml_graph_reset (&gf);
+    ggml_graph_reset (gf);
     ggml_set_f32 (f->grad, 1.0f);
 
-    ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+    ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
 
-    // ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
-    // ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot");
+    // ggml_graph_dump_dot(gf, NULL, "test-grad0-forward.dot");
+    // ggml_graph_dump_dot(gb, gf, "test-grad0-backward.dot");
 
     for (int i = 0; i < nargs; ++i) {
         const int nelements = ggml_nelements(x[i]);
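For reference, the pointer-based graph API that the test now uses looks roughly like this in isolation; the context, loss tensor and thread count below are assumptions, and the sequence simply mirrors the change above:

    // assumed setup: a ggml_context * ctx with enough memory, inputs marked
    // with ggml_set_param, and a scalar tensor `loss` built from them
    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, loss);   // forward graph
    struct ggml_cgraph * gb = ggml_new_graph(ctx);                 // storage for the backward graph
    *gb = *gf;                                                     // start from the forward nodes
    ggml_build_backward_expand(ctx, gf, gb, false);                // append the gradient nodes

    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);       // forward pass
    ggml_graph_reset(gf);                                          // zero the gradients
    ggml_set_f32(loss->grad, 1.0f);                                // seed d(loss)/d(loss) = 1
    ggml_graph_compute_with_ctx(ctx, gb, /*n_threads =*/ 1);       // backward pass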
@@ -273,13 +275,13 @@ static bool check_gradient(
             const float xp = x0 + eps;
             ggml_set_f32_1d(x[i], k, xp);
 
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
 
             const double f0 = ggml_get_f32_1d(f, 0);
 
             ggml_set_f32_1d(x[i], k, xm);
 
-            ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
+            ggml_graph_compute_with_ctx(ctx0, gf, n_threads);
 
             const double f1 = ggml_get_f32_1d(f, 0);
             const double g0 = (f0 - f1)/(2.0*(double) eps);

@@ -287,10 +289,10 @@ static bool check_gradient(
             ggml_set_f32_1d(x[i], k, x0);
 
             // compute gradient using backward graph
-            ggml_graph_reset (&gf);
+            ggml_graph_reset (gf);
             ggml_set_f32 (f->grad, 1.0f);
 
-            ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
+            ggml_graph_compute_with_ctx(ctx0, gb, n_threads);
 
             const double g1 = ggml_get_f32_1d(x[i]->grad, k);
 
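In the two hunks above, check_gradient estimates each partial derivative numerically with the central difference g0 = (f(x0 + eps) - f(x0 - eps)) / (2*eps), computed from two forward evaluations of gf, and compares it against the analytic value g1 read from x[i]->grad after running the backward graph gb.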
@@ -373,7 +375,7 @@ static bool check_mat_mul(
 
 int main(int argc, const char ** argv) {
     struct ggml_init_params params = {
-        /* .mem_size   = */ 128*1024*1024,
+        /* .mem_size   = */ 256*1024*1024,
         /* .mem_buffer = */ NULL,
         /* .no_alloc   = */ false,
     };
@@ -405,6 +407,7 @@ int main(int argc, const char ** argv) {
         }
     }
 
+    unsigned seed_iter = 1;
 
     // original loop: 1000
    int niter = 4;

@@ -416,6 +419,10 @@ int main(int argc, const char ** argv) {
        niter = atoi(argv[1]);
    }
    for (int iter = 0; iter < niter; ++iter) {
+        srand(seed_iter);
+        seed_iter = rand();
+        unsigned seed = rand();
+
        printf("test-grad0: iter:%d/%d\n", iter, niter);
        struct ggml_context * ctx0 = ggml_init(params);
 
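With seed_iter carried across iterations, each pass of the loop reseeds the C RNG deterministically and derives a fresh per-iteration seed; every op-test block below then calls srand(seed) before building its tensors, so for a given iteration each op draws the same random data no matter which tests ran before it, which appears intended to make individual failures reproducible in isolation.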
@@ -425,6 +432,7 @@ int main(int argc, const char ** argv) {
 
         // add f32
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -441,6 +449,7 @@ int main(int argc, const char ** argv) {
 
         // add f16
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -457,6 +466,7 @@ int main(int argc, const char ** argv) {
 
         // sub
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -473,6 +483,7 @@ int main(int argc, const char ** argv) {
 
         // mul
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -489,6 +500,7 @@ int main(int argc, const char ** argv) {
 
         // div
        {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -505,6 +517,7 @@ int main(int argc, const char ** argv) {
 
         // sqr
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -521,6 +534,7 @@ int main(int argc, const char ** argv) {
 
         // sqrt
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -537,6 +551,7 @@ int main(int argc, const char ** argv) {
 
         // log
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -553,6 +568,7 @@ int main(int argc, const char ** argv) {
 
         // sum
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -570,6 +586,7 @@ int main(int argc, const char ** argv) {
 
         // sum_rows
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -587,6 +604,7 @@ int main(int argc, const char ** argv) {
         // mean, not yet fully implemented
         if(0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -604,6 +622,7 @@ int main(int argc, const char ** argv) {
         // argmax
         if (0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -620,6 +639,7 @@ int main(int argc, const char ** argv) {
 
         // repeat
         {
+            srand(seed);
             int64_t ne2[4];
             get_random_dims(ne2, 4);
 

@@ -642,6 +662,7 @@ int main(int argc, const char ** argv) {
 
         // repeat back
         {
+            srand(seed);
             int64_t ne2[4];
             get_random_dims(ne2, 4);
 

@@ -680,6 +701,7 @@ int main(int argc, const char ** argv) {
 
         // sgn
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -696,6 +718,7 @@ int main(int argc, const char ** argv) {
 
         // neg
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -712,6 +735,7 @@ int main(int argc, const char ** argv) {
 
         // step
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -729,6 +753,7 @@ int main(int argc, const char ** argv) {
         // tanh, not yet fully implemented
         if(0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {
@@ -745,33 +770,45 @@ int main(int argc, const char ** argv) {
 
         // mul_mat
         {
+            srand(seed);
             const int nargs = 2;
 
-            for (int ndims = 2; ndims <= 2; ++ndims) {
+            for (int ndims = 2; ndims <= 4; ++ndims) {
+                int max_nrep = (ndims >= 3) ? 2 : 1;
                 x[0] = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);
-                {
-                    int64_t ne2[4];
-                    get_random_dims(ne2, 4);
-                    ne2[0] = ne[0];
-                    x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
-                }
-
-                ggml_set_param(ctx0, x[0]);
-                ggml_set_param(ctx0, x[1]);
-
-                struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
-                struct ggml_tensor * f = ggml_sum(ctx0, m);
-
-                GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
-
-                check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
-                check_mat_mul(m, x[1], x[0]);
+                for (int nrep2 = 1; nrep2 < max_nrep; ++nrep2) {
+                    for (int nrep3 = 1; nrep3 < max_nrep; ++nrep3) {
+                        {
+                            int64_t ne2[4];
+                            get_random_dims(ne2, 4);
+                            ne2[0] = ne[0];
+                            ne2[2] = nrep2 * ne[2];
+                            ne2[3] = nrep3 * ne[3];
+                            x[1] = get_random_tensor_f32(ctx0, ndims, ne2, -1.0f, 1.0f);
+                        }
+
+                        ggml_set_param(ctx0, x[0]);
+                        ggml_set_param(ctx0, x[1]);
+
+                        struct ggml_tensor * m = ggml_mul_mat(ctx0, x[1], x[0]);
+                        struct ggml_tensor * f = ggml_sum(ctx0, m);
+
+                        GGML_PRINT_DEBUG("testing: mul_mat, [%lld, %lld] (%d) * [%lld, %lld] (%d)\n", x[1]->ne[0], x[1]->ne[1], x[1]->n_dims, x[0]->ne[0], x[0]->ne[1], x[0]->n_dims);
+
+                        check_gradient("mul_mat", ctx0, x, f, ndims, nargs, 1e-3f, 1e-3f, INFINITY);
+                        if (ndims == 2) {
+                            // check_mat_mul does not support ndims > 2
+                            check_mat_mul(m, x[1], x[0]);
+                        }
+                    }
+                }
             }
         }
 
         // elu, not yet fully implemented
         if(0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {
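The mul_mat gradient check now also covers 3-D and 4-D operands: dims 2 and 3 of x[1] are scaled by nrep2 and nrep3 relative to x[0], so broadcasted (batched) matrix multiplication is exercised as well, while check_mat_mul itself is still only run in the 2-D case it supports.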
@@ -788,6 +825,7 @@ int main(int argc, const char ** argv) {
 
         // relu
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -805,6 +843,7 @@ int main(int argc, const char ** argv) {
         // gelu, not yet fully implemented
         if(0)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 4; ++ndims) {

@@ -821,6 +860,7 @@ int main(int argc, const char ** argv) {
 
         // silu
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -842,6 +882,7 @@ int main(int argc, const char ** argv) {
 
         // rms_norm
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -858,6 +899,7 @@ int main(int argc, const char ** argv) {
 
         // scale
         {
+            srand(seed);
             const int nargs = 2;
 
             int64_t ne2[4];

@@ -878,6 +920,7 @@ int main(int argc, const char ** argv) {
 
         // cpy f32
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -895,6 +938,7 @@ int main(int argc, const char ** argv) {
 
         // cpy f16
         {
+            srand(seed);
             const int nargs = 2;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -912,6 +956,7 @@ int main(int argc, const char ** argv) {
 
         // reshape (1d->nd)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -935,6 +980,7 @@ int main(int argc, const char ** argv) {
 
         // reshape (nd->1d)
         {
+            srand(seed);
             const int nargs = 1;
 
             for (int ndims = 1; ndims <= 2; ++ndims) {

@@ -958,6 +1004,7 @@ int main(int argc, const char ** argv) {
 
         // acc 1d
         {
+            srand(seed);
             int64_t ne2[4] = { 1, 1, 1, 1 };
 
             const int nargs = 2;

@@ -985,6 +1032,7 @@ int main(int argc, const char ** argv) {
 
         // acc 2d
         {
+            srand(seed);
             int64_t ne2[4] = { 1, 1, 1, 1 };
             int64_t max_offsets[4] = { 0, 0, 0, 0 };
             int64_t offsets[4] = { 0, 0, 0, 0 };

@@ -1017,6 +1065,7 @@ int main(int argc, const char ** argv) {
 
         // acc 3d
         {
+            srand(seed);
             int64_t ne2[4] = { 1, 1, 1, 1 };
             int64_t max_offsets[4] = { 0, 0, 0, 0 };
             int64_t offsets[4] = { 0, 0, 0, 0 };

@@ -1051,6 +1100,7 @@ int main(int argc, const char ** argv) {
 
         // acc 4d
         {
+            srand(seed);
             int64_t ne2[4] = { 1, 1, 1, 1 };
             int64_t max_offsets[4] = { 0, 0, 0, 0 };
             int64_t offsets[4] = { 0, 0, 0, 0 };

@@ -1087,6 +1137,7 @@ int main(int argc, const char ** argv) {
 
         // set_1d
         {
+            srand(seed);
             int64_t ne2[4];
 
             const int nargs = 2;

@@ -1114,6 +1165,7 @@ int main(int argc, const char ** argv) {
 
         // set_2d
         {
+            srand(seed);
             int64_t ne2[4];
             int64_t max_offsets[4] = { 0, 0, 0, 0 };
             int64_t offsets[4] = { 0, 0, 0, 0 };

@@ -1146,6 +1198,7 @@ int main(int argc, const char ** argv) {
 
         // view_1d
         {
+            srand(seed);
             const int nargs = 1;
             for (int ndims = 1; ndims <= 4; ++ndims) {
 

@@ -1169,6 +1222,7 @@ int main(int argc, const char ** argv) {
 
         // view_2d
         {
+            srand(seed);
             int64_t ne2[4];
             int64_t nb2[4];
 

@@ -1199,6 +1253,7 @@ int main(int argc, const char ** argv) {
 
         // view_3d
         {
+            srand(seed);
             int64_t ne2[4] = {1,1,1,1};
             int64_t nb2[4] = {0,0,0,0};
 

@@ -1230,6 +1285,7 @@ int main(int argc, const char ** argv) {
 
         // permute
         {
+            srand(seed);
             int64_t ne2[4];
 
             const int nargs = 1;

@@ -1263,6 +1319,7 @@ int main(int argc, const char ** argv) {
 
         // transpose
         {
+            srand(seed);
             int64_t ne2[4];
 
             const int nargs = 1;

@@ -1290,6 +1347,7 @@ int main(int argc, const char ** argv) {
 
         // get_rows
         {
+            srand(seed);
             int64_t ne2[4] = {ne[0], ne[1], 1, 1};
             int64_t ne3[4] = {1+irand(ne[1]), 1, 1, 1};
             const int nargs = 1;

@@ -1306,6 +1364,7 @@ int main(int argc, const char ** argv) {
 
         // diag_mask_inf
         {
+            srand(seed);
             const int nargs = 1;
             const int ndims = 2;
 

@@ -1321,6 +1380,7 @@ int main(int argc, const char ** argv) {
 
         // diag_mask_zero
         {
+            srand(seed);
             const int nargs = 1;
             const int ndims = 2;
 

@@ -1336,6 +1396,7 @@ int main(int argc, const char ** argv) {
 
         // softmax
         {
+            srand(seed);
             const int nargs = 1;
 
             int64_t ne2[4];
@@ -1357,11 +1418,16 @@ int main(int argc, const char ** argv) {
                                             ggml_new_f32(ctx0, eps))));
 
                 check_gradient("softmax", ctx0, x, f, ndims, nargs, 1e-3f, 2e-1f, INFINITY);
+                // NOTE: softmax forward is computed using f16 table lookup instead of using actual expf, but backward assumes actual expf.
+                // this may result in different gradients too finite differences.
+                // when this test reports errors, first try to replace the table lookup with actual expf and test again to see if just that was the cause.
+                // if only the table lookup causes gradients to differ this is acceptable.
             }
         }
 
         // cross_entropy_loss
         {
+            srand(seed);
             const int nargs = 1;
 
             int64_t ne2[4];
@@ -1392,6 +1458,7 @@ int main(int argc, const char ** argv) {
 
         // rope f32
         {
+            srand(seed);
             const int nargs = 1;
 
             int64_t ne2[4];

@@ -1431,6 +1498,7 @@ int main(int argc, const char ** argv) {
 
         // rope f16
         {
+            srand(seed);
             const int nargs = 1;
 
             int64_t ne2[4];
|
||||||
|
|
||||||
// flash_attn f32
|
// flash_attn f32
|
||||||
{
|
{
|
||||||
|
srand(seed);
|
||||||
const int nargs = 3;
|
const int nargs = 3;
|
||||||
|
|
||||||
int64_t ne2[4];
|
int64_t ne2[4];
|
||||||
|
@ -1482,28 +1551,31 @@ int main(int argc, const char ** argv) {
|
||||||
|
|
||||||
for (int masked = 0; masked <= 1; ++masked) {
|
for (int masked = 0; masked <= 1; ++masked) {
|
||||||
for (int ndims = 2; ndims <= 4; ++ndims) {
|
for (int ndims = 2; ndims <= 4; ++ndims) {
|
||||||
int64_t neq[4] = { D, N, B, ne[3] };
|
int max_nrep = (ndims >= 3) ? 2 : 1;
|
||||||
int64_t nek[4] = { D, M, B, ne[3] };
|
for (int nrep = 1; nrep < max_nrep; ++nrep) {
|
||||||
int64_t nev[4] = { M, D, B, ne[3] };
|
int64_t neq[4] = { D, N, B*nrep, ne[3] };
|
||||||
if (ndims == 2) {
|
int64_t nek[4] = { D, M, B, ne[3] };
|
||||||
neq[2] = 1; neq[3] = 1;
|
int64_t nev[4] = { M, D, B, ne[3] };
|
||||||
nek[2] = 1; nek[3] = 1;
|
if (ndims == 2) {
|
||||||
nev[2] = 1; nev[3] = 1;
|
neq[2] = 1; neq[3] = 1;
|
||||||
} else if (ndims == 3) {
|
nek[2] = 1; nek[3] = 1;
|
||||||
neq[3] = 1;
|
nev[2] = 1; nev[3] = 1;
|
||||||
nek[3] = 1;
|
} else if (ndims == 3) {
|
||||||
nev[3] = 1;
|
neq[3] = 1;
|
||||||
|
nek[3] = 1;
|
||||||
|
nev[3] = 1;
|
||||||
|
}
|
||||||
|
x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
|
||||||
|
x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
|
||||||
|
x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
|
||||||
|
ggml_set_param(ctx0, x[0]);
|
||||||
|
ggml_set_param(ctx0, x[1]);
|
||||||
|
ggml_set_param(ctx0, x[2]);
|
||||||
|
|
||||||
|
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
||||||
|
|
||||||
|
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
||||||
}
|
}
|
||||||
x[0] = get_random_tensor_f32(ctx0, ndims, neq, -0.1250f, 0.1250f);
|
|
||||||
x[1] = get_random_tensor_f32(ctx0, ndims, nek, -0.1250f, 0.1250f);
|
|
||||||
x[2] = get_random_tensor_f32(ctx0, ndims, nev, -0.1250f, 0.1250f);
|
|
||||||
ggml_set_param(ctx0, x[0]);
|
|
||||||
ggml_set_param(ctx0, x[1]);
|
|
||||||
ggml_set_param(ctx0, x[2]);
|
|
||||||
|
|
||||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_flash_attn(ctx0, x[0], x[1], x[2], (masked == 0)));
|
|
||||||
|
|
||||||
check_gradient("flash_attn f32", ctx0, x, f, ndims, nargs, 1.5e-4f, 1e-3f, INFINITY);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1511,6 +1583,7 @@ int main(int argc, const char ** argv) {
|
||||||
// flash_attn f16, not yet fully implemented
|
// flash_attn f16, not yet fully implemented
|
||||||
if(0)
|
if(0)
|
||||||
{
|
{
|
||||||
|
srand(seed);
|
||||||
const int nargs = 3;
|
const int nargs = 3;
|
||||||
|
|
||||||
int64_t ne2[4];
|
int64_t ne2[4];
|
||||||
|
|