Merge branch 'master' into concedo
# Conflicts: # .github/workflows/build.yml # README.md
This commit is contained in:
commit
235daf4016
11 changed files with 821 additions and 656 deletions
4
Makefile
4
Makefile
|
@ -128,9 +128,9 @@ ifdef LLAMA_CUBLAS
|
|||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64
|
||||
OBJS += ggml-cuda.o
|
||||
NVCC = nvcc
|
||||
NVCCFLAGS = --forward-unknown-to-host-linker -arch=native
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -c $< -o $@
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
endif
|
||||
ifdef LLAMA_GPROF
|
||||
CFLAGS += -pg
|
||||
|
|
|
@ -34,4 +34,5 @@ else()
|
|||
add_subdirectory(quantize-stats)
|
||||
add_subdirectory(perplexity)
|
||||
add_subdirectory(embedding)
|
||||
add_subdirectory(save-load-state)
|
||||
endif()
|
||||
|
|
|
@ -156,10 +156,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
|||
params.interactive = true;
|
||||
} else if (arg == "--embedding") {
|
||||
params.embedding = true;
|
||||
} else if (arg == "--interactive-start") {
|
||||
params.interactive = true;
|
||||
} else if (arg == "--interactive-first") {
|
||||
params.interactive_start = true;
|
||||
params.interactive_first = true;
|
||||
} else if (arg == "-ins" || arg == "--instruct") {
|
||||
params.instruct = true;
|
||||
} else if (arg == "--color") {
|
||||
|
|
|
@ -43,7 +43,7 @@ struct gpt_params {
|
|||
bool interactive = false; // interactive mode
|
||||
|
||||
bool embedding = false; // get only sentence embedding
|
||||
bool interactive_start = false; // wait for user input immediately
|
||||
bool interactive_first = false; // wait for user input immediately
|
||||
|
||||
bool instruct = false; // instruction mode (used for Alpaca models)
|
||||
bool ignore_eos = false; // do not stop generating after eos
|
||||
|
|
|
@ -21,12 +21,20 @@ To get started right away, run the following command, making sure to use the cor
|
|||
./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
|
||||
```
|
||||
|
||||
For an interactive experience, try this command:
|
||||
|
||||
```bash
|
||||
./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
|
||||
```
|
||||
|
||||
Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
|
||||
|
||||
## Common Options
|
||||
|
||||
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
|
||||
|
@ -84,6 +92,8 @@ Instruction mode is particularly useful when working with Alpaca models, which a
|
|||
|
||||
- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
|
||||
|
||||
Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
|
||||
|
||||
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
|
||||
|
||||
## Context Management
|
||||
|
@ -114,7 +124,7 @@ The following options are related to controlling the text generation process, in
|
|||
|
||||
The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
|
||||
|
||||
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value.
|
||||
It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode, text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
|
||||
|
||||
### RNG Seed
|
||||
|
||||
|
@ -126,7 +136,7 @@ The RNG seed is used to initialize the random number generator that influences t
|
|||
|
||||
- `--temp N`: Adjust the randomness of the generated text (default: 0.8).
|
||||
|
||||
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism.
|
||||
Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.
|
||||
|
||||
Example usage: `--temp 0.8`
|
||||
|
||||
|
|
|
@ -178,12 +178,12 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||
if (params.instruct) {
|
||||
params.interactive_start = true;
|
||||
params.interactive_first = true;
|
||||
params.antiprompt.push_back("### Instruction:\n\n");
|
||||
}
|
||||
|
||||
// enable interactive mode if reverse prompt or interactive start is specified
|
||||
if (params.antiprompt.size() != 0 || params.interactive_start) {
|
||||
if (params.antiprompt.size() != 0 || params.interactive_first) {
|
||||
params.interactive = true;
|
||||
}
|
||||
|
||||
|
@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
|
|||
#endif
|
||||
" - Press Return to return control to LLaMa.\n"
|
||||
" - If you want to submit another line, end your input in '\\'.\n\n");
|
||||
is_interacting = params.interactive_start;
|
||||
is_interacting = params.interactive_first;
|
||||
}
|
||||
|
||||
bool is_antiprompt = false;
|
||||
|
|
4
examples/save-load-state/CMakeLists.txt
Normal file
4
examples/save-load-state/CMakeLists.txt
Normal file
|
@ -0,0 +1,4 @@
|
|||
# Build the save-load-state example: one executable compiled from
# save-load-state.cpp, linked against the common and llama targets plus the
# thread library (CMAKE_THREAD_LIBS_INIT), and requiring at least C++11.
set(TARGET save-load-state)
|
||||
add_executable(${TARGET} save-load-state.cpp)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
128
examples/save-load-state/save-load-state.cpp
Normal file
128
examples/save-load-state/save-load-state.cpp
Normal file
|
@ -0,0 +1,128 @@
|
|||
#include <vector>
|
||||
#include <cstdio>
|
||||
#include <chrono>
|
||||
|
||||
#include "common.h"
|
||||
#include "llama.h"
|
||||
#include "llama.cpp"
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
gpt_params params;
|
||||
params.model = "models/llama-7B/ggml-model.bin";
|
||||
params.seed = 42;
|
||||
params.n_threads = 4;
|
||||
params.repeat_last_n = 64;
|
||||
params.prompt = "The quick brown fox";
|
||||
|
||||
if (gpt_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto lparams = llama_context_default_params();
|
||||
|
||||
lparams.n_ctx = params.n_ctx;
|
||||
lparams.n_parts = params.n_parts;
|
||||
lparams.seed = params.seed;
|
||||
lparams.f16_kv = params.memory_f16;
|
||||
lparams.use_mmap = params.use_mmap;
|
||||
lparams.use_mlock = params.use_mlock;
|
||||
|
||||
auto n_past = 0;
|
||||
auto last_n_tokens_data = vector<llama_token>(params.repeat_last_n, 0);
|
||||
|
||||
// init
|
||||
auto ctx = llama_init_from_file(params.model.c_str(), lparams);
|
||||
auto tokens = vector<llama_token>(params.n_ctx);
|
||||
auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);
|
||||
|
||||
if (n_prompt_tokens < 1) {
|
||||
fprintf(stderr, "%s : failed to tokenize prompt\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// evaluate prompt
|
||||
|
||||
llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);
|
||||
|
||||
last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
|
||||
n_past += n_prompt_tokens;
|
||||
|
||||
// Save state (rng, logits, embedding and kv_cache) to file
|
||||
FILE *fp_write = fopen("dump_state.bin", "wb");
|
||||
auto state_size = llama_get_state_size(ctx);
|
||||
auto state_mem = new uint8_t[state_size];
|
||||
llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
|
||||
fwrite(state_mem, 1, state_size, fp_write);
|
||||
fclose(fp_write);
|
||||
|
||||
// save state (last tokens)
|
||||
auto last_n_tokens_data_saved = vector<llama_token>(last_n_tokens_data);
|
||||
auto n_past_saved = n_past;
|
||||
|
||||
// first run
|
||||
printf("\n%s", params.prompt.c_str());
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sample_top_p_top_k(
|
||||
ctx,
|
||||
&last_n_tokens_data.back() - params.repeat_last_n,
|
||||
params.repeat_last_n,
|
||||
40,
|
||||
1.0,
|
||||
1.0,
|
||||
1.1);
|
||||
auto next_token_str = llama_token_to_str(ctx, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
printf("\n\n");
|
||||
|
||||
// free old model
|
||||
llama_free(ctx);
|
||||
|
||||
// load new model
|
||||
|
||||
auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);
|
||||
|
||||
// Load state (rng, logits, embedding and kv_cache) from file
|
||||
FILE *fp_read = fopen("dump_state.bin", "rb");
|
||||
auto state_size2 = llama_get_state_size(ctx2);
|
||||
if (state_size != state_size2) {
|
||||
fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
|
||||
}
|
||||
fread(state_mem, 1, state_size, fp_read);
|
||||
llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
|
||||
fclose(fp_read);
|
||||
|
||||
// restore state (last tokens)
|
||||
last_n_tokens_data = last_n_tokens_data_saved;
|
||||
n_past = n_past_saved;
|
||||
|
||||
// second run
|
||||
for (auto i = 0; i < params.n_predict; i++) {
|
||||
auto next_token = llama_sample_top_p_top_k(
|
||||
ctx2,
|
||||
&last_n_tokens_data.back() - params.repeat_last_n,
|
||||
params.repeat_last_n,
|
||||
40,
|
||||
1.0,
|
||||
1.0,
|
||||
1.1);
|
||||
auto next_token_str = llama_token_to_str(ctx2, next_token);
|
||||
last_n_tokens_data.push_back(next_token);
|
||||
printf("%s", next_token_str);
|
||||
if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
|
||||
fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
n_past += 1;
|
||||
}
|
||||
printf("\n\n");
|
||||
return 0;
|
||||
}
|
9
ggml.c
9
ggml.c
|
@ -438,7 +438,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
|||
static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
|
||||
{
|
||||
// Load 8 bytes from memory
|
||||
__m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
|
||||
__m128i tmp = _mm_loadl_epi64( ( const __m128i* )rsi );
|
||||
|
||||
// Expand bytes into uint16_t values
|
||||
__m128i bytes = _mm_cvtepu8_epi16( tmp );
|
||||
|
@ -6781,15 +6781,20 @@ static void ggml_compute_forward_sum_f32(
|
|||
const size_t nb02 = src0->nb[2];
|
||||
const size_t nb03 = src0->nb[3];
|
||||
|
||||
ggml_float sum = 0;
|
||||
float row_sum = 0;
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
for (int64_t i01 = 0; i01 < ne01; i01++) {
|
||||
ggml_vec_sum_f32(ne00,
|
||||
(float *) (dst->data),
|
||||
&row_sum,
|
||||
(float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
|
||||
sum += row_sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
((float *) dst->data)[0] = sum;
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_sum(
|
||||
|
|
253
ggml.h
253
ggml.h
|
@ -169,14 +169,27 @@
|
|||
//
|
||||
//
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#ifdef GGML_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef GGML_BUILD
|
||||
# define GGML_API __declspec(dllexport)
|
||||
# else
|
||||
# define GGML_API __declspec(dllimport)
|
||||
# endif
|
||||
# else
|
||||
# define GGML_API __attribute__ ((visibility ("default")))
|
||||
# endif
|
||||
#else
|
||||
# define GGML_API
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
|
||||
#define GGML_FILE_VERSION 1
|
||||
|
||||
#define GGML_MAX_DIMS 4
|
||||
#define GGML_MAX_NODES 4096
|
||||
#define GGML_MAX_PARAMS 16
|
||||
|
@ -184,6 +197,10 @@ extern "C" {
|
|||
#define GGML_MAX_OPT 4
|
||||
#define GGML_DEFAULT_N_THREADS 4
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __ARM_NEON
|
||||
// we use the built-in 16-bit float type
|
||||
typedef __fp16 ggml_fp16_t;
|
||||
|
@ -192,14 +209,13 @@ typedef uint16_t ggml_fp16_t;
|
|||
#endif
|
||||
|
||||
// convert FP16 <-> FP32
|
||||
float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||
ggml_fp16_t ggml_fp32_to_fp16(float x);
|
||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
||||
|
||||
struct ggml_object;
|
||||
struct ggml_context;
|
||||
|
||||
enum ggml_type {
|
||||
// explicitly numbered values are used in llama.cpp files
|
||||
GGML_TYPE_F32 = 0,
|
||||
GGML_TYPE_F16 = 1,
|
||||
GGML_TYPE_Q4_0 = 2,
|
||||
|
@ -341,60 +357,64 @@ struct ggml_init_params {
|
|||
bool no_alloc; // don't allocate memory for the tensor data
|
||||
};
|
||||
|
||||
void ggml_time_init(void); // call this once at the beginning of the program
|
||||
int64_t ggml_time_ms(void);
|
||||
int64_t ggml_time_us(void);
|
||||
int64_t ggml_cycles(void);
|
||||
int64_t ggml_cycles_per_ms(void);
|
||||
// misc
|
||||
|
||||
void ggml_print_object (const struct ggml_object * obj);
|
||||
void ggml_print_objects(const struct ggml_context * ctx);
|
||||
GGML_API void ggml_time_init(void); // call this once at the beginning of the program
|
||||
GGML_API int64_t ggml_time_ms(void);
|
||||
GGML_API int64_t ggml_time_us(void);
|
||||
GGML_API int64_t ggml_cycles(void);
|
||||
GGML_API int64_t ggml_cycles_per_ms(void);
|
||||
|
||||
int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
||||
size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||
|
||||
int ggml_blck_size (enum ggml_type type);
|
||||
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
||||
GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
|
||||
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||
|
||||
const char * ggml_type_name(enum ggml_type type);
|
||||
GGML_API int ggml_blck_size (enum ggml_type type);
|
||||
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
|
||||
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
|
||||
|
||||
size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||
|
||||
bool ggml_is_quantized(enum ggml_type type);
|
||||
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||
|
||||
struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
void ggml_free(struct ggml_context * ctx);
|
||||
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
||||
|
||||
size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||
// main
|
||||
|
||||
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
|
||||
GGML_API void ggml_free(struct ggml_context * ctx);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor(
|
||||
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
|
||||
|
||||
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int n_dims,
|
||||
const int64_t *ne);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_1d(
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor_1d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int64_t ne0);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_2d(
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor_2d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int64_t ne0,
|
||||
int64_t ne1);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_3d(
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor_3d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2);
|
||||
|
||||
struct ggml_tensor * ggml_new_tensor_4d(
|
||||
GGML_API struct ggml_tensor * ggml_new_tensor_4d(
|
||||
struct ggml_context * ctx,
|
||||
enum ggml_type type,
|
||||
int64_t ne0,
|
||||
|
@ -402,128 +422,127 @@ struct ggml_tensor * ggml_new_tensor_4d(
|
|||
int64_t ne2,
|
||||
int64_t ne3);
|
||||
|
||||
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
|
||||
struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
||||
|
||||
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||
float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
|
||||
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
|
||||
|
||||
//
|
||||
// operations on tensors with backpropagation
|
||||
//
|
||||
|
||||
struct ggml_tensor * ggml_dup(
|
||||
GGML_API struct ggml_tensor * ggml_dup(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_add(
|
||||
GGML_API struct ggml_tensor * ggml_add(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
|
||||
struct ggml_tensor * ggml_add_inplace(
|
||||
GGML_API struct ggml_tensor * ggml_add_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
struct ggml_tensor * ggml_sub(
|
||||
GGML_API struct ggml_tensor * ggml_sub(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
struct ggml_tensor * ggml_mul(
|
||||
GGML_API struct ggml_tensor * ggml_mul(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
struct ggml_tensor * ggml_div(
|
||||
GGML_API struct ggml_tensor * ggml_div(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
struct ggml_tensor * ggml_sqr(
|
||||
GGML_API struct ggml_tensor * ggml_sqr(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_sqrt(
|
||||
GGML_API struct ggml_tensor * ggml_sqrt(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// return scalar
|
||||
// TODO: compute sum along rows
|
||||
struct ggml_tensor * ggml_sum(
|
||||
GGML_API struct ggml_tensor * ggml_sum(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// mean along rows
|
||||
struct ggml_tensor * ggml_mean(
|
||||
GGML_API struct ggml_tensor * ggml_mean(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// if a is the same shape as b, and a is not parameter, return a
|
||||
// otherwise, return a new tensor: repeat(a) to fit in b
|
||||
struct ggml_tensor * ggml_repeat(
|
||||
GGML_API struct ggml_tensor * ggml_repeat(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
struct ggml_tensor * ggml_abs(
|
||||
GGML_API struct ggml_tensor * ggml_abs(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_sgn(
|
||||
GGML_API struct ggml_tensor * ggml_sgn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_neg(
|
||||
GGML_API struct ggml_tensor * ggml_neg(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_step(
|
||||
GGML_API struct ggml_tensor * ggml_step(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_relu(
|
||||
GGML_API struct ggml_tensor * ggml_relu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// TODO: double-check this computation is correct
|
||||
struct ggml_tensor * ggml_gelu(
|
||||
GGML_API struct ggml_tensor * ggml_gelu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_silu(
|
||||
GGML_API struct ggml_tensor * ggml_silu(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// normalize along rows
|
||||
// TODO: eps is hardcoded to 1e-5 for now
|
||||
struct ggml_tensor * ggml_norm(
|
||||
GGML_API struct ggml_tensor * ggml_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_rms_norm(
|
||||
GGML_API struct ggml_tensor * ggml_rms_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// A: m rows, n columns
|
||||
// B: p rows, n columns (i.e. we transpose it internally)
|
||||
// result is m columns, p rows
|
||||
struct ggml_tensor * ggml_mul_mat(
|
||||
GGML_API struct ggml_tensor * ggml_mul_mat(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
@ -533,32 +552,32 @@ struct ggml_tensor * ggml_mul_mat(
|
|||
//
|
||||
|
||||
// in-place, returns view(a)
|
||||
struct ggml_tensor * ggml_scale(
|
||||
GGML_API struct ggml_tensor * ggml_scale(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// a -> b, return view(b)
|
||||
struct ggml_tensor * ggml_cpy(
|
||||
GGML_API struct ggml_tensor * ggml_cpy(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// make contiguous
|
||||
struct ggml_tensor * ggml_cont(
|
||||
GGML_API struct ggml_tensor * ggml_cont(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// return view(a), b specifies the new shape
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
struct ggml_tensor * ggml_reshape(
|
||||
GGML_API struct ggml_tensor * ggml_reshape(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// return view(a)
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
struct ggml_tensor * ggml_reshape_2d(
|
||||
GGML_API struct ggml_tensor * ggml_reshape_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
|
@ -566,7 +585,7 @@ struct ggml_tensor * ggml_reshape_2d(
|
|||
|
||||
// return view(a)
|
||||
// TODO: when we start computing gradient, make a copy instead of view
|
||||
struct ggml_tensor * ggml_reshape_3d(
|
||||
GGML_API struct ggml_tensor * ggml_reshape_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
|
@ -574,13 +593,13 @@ struct ggml_tensor * ggml_reshape_3d(
|
|||
int64_t ne2);
|
||||
|
||||
// offset in bytes
|
||||
struct ggml_tensor * ggml_view_1d(
|
||||
GGML_API struct ggml_tensor * ggml_view_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_view_2d(
|
||||
GGML_API struct ggml_tensor * ggml_view_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
|
@ -588,7 +607,7 @@ struct ggml_tensor * ggml_view_2d(
|
|||
size_t nb1, // row stride in bytes
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_view_3d(
|
||||
GGML_API struct ggml_tensor * ggml_view_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
|
@ -598,7 +617,7 @@ struct ggml_tensor * ggml_view_3d(
|
|||
size_t nb2, // slice stride in bytes
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_permute(
|
||||
GGML_API struct ggml_tensor * ggml_permute(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int axis0,
|
||||
|
@ -607,24 +626,24 @@ struct ggml_tensor * ggml_permute(
|
|||
int axis3);
|
||||
|
||||
// alias for ggml_permute(ctx, a, 1, 0, 2, 3)
|
||||
struct ggml_tensor * ggml_transpose(
|
||||
GGML_API struct ggml_tensor * ggml_transpose(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
struct ggml_tensor * ggml_get_rows(
|
||||
GGML_API struct ggml_tensor * ggml_get_rows(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// set elements above the diagonal to -INF
|
||||
// in-place, returns view(a)
|
||||
struct ggml_tensor * ggml_diag_mask_inf(
|
||||
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past);
|
||||
|
||||
// in-place, returns view(a)
|
||||
struct ggml_tensor * ggml_soft_max(
|
||||
GGML_API struct ggml_tensor * ggml_soft_max(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
|
@ -633,7 +652,7 @@ struct ggml_tensor * ggml_soft_max(
|
|||
// if mode & 1 == 1, skip n_past elements
|
||||
// if mode & 2 != 0, GPT-NeoX style
|
||||
// TODO: avoid creating a new tensor every time
|
||||
struct ggml_tensor * ggml_rope(
|
||||
GGML_API struct ggml_tensor * ggml_rope(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int n_past,
|
||||
|
@ -644,24 +663,24 @@ struct ggml_tensor * ggml_rope(
|
|||
// TODO: we don't support extra parameters for now
|
||||
// that's why we are hard-coding the stride, padding, and dilation
|
||||
// not great ..
|
||||
struct ggml_tensor * ggml_conv_1d_1s(
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_1s(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
struct ggml_tensor * ggml_conv_1d_2s(
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d_2s(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
struct ggml_tensor * ggml_flash_attn(
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
bool masked);
|
||||
|
||||
struct ggml_tensor * ggml_flash_ff(
|
||||
GGML_API struct ggml_tensor * ggml_flash_ff(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b0,
|
||||
|
@ -670,15 +689,15 @@ struct ggml_tensor * ggml_flash_ff(
|
|||
struct ggml_tensor * c1);
|
||||
|
||||
// Mapping operations
|
||||
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
||||
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
||||
GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
|
||||
GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
|
||||
|
||||
struct ggml_tensor * ggml_map_unary_f32(
|
||||
GGML_API struct ggml_tensor * ggml_map_unary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
const ggml_unary_op_f32_t fun);
|
||||
|
||||
struct ggml_tensor * ggml_map_binary_f32(
|
||||
GGML_API struct ggml_tensor * ggml_map_binary_f32(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
|
@ -688,23 +707,23 @@ struct ggml_tensor * ggml_map_binary_f32(
|
|||
// automatic differentiation
|
||||
//
|
||||
|
||||
void ggml_set_param(
|
||||
GGML_API void ggml_set_param(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * tensor);
|
||||
|
||||
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
|
||||
struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||
|
||||
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||
|
||||
// print info and performance information for the graph
|
||||
void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
||||
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
|
||||
|
||||
// dump the graph into a file using the dot format
|
||||
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
||||
|
||||
//
|
||||
// optimization
|
||||
|
@ -797,10 +816,10 @@ struct ggml_opt_params {
|
|||
} lbfgs;
|
||||
};
|
||||
|
||||
struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
||||
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
||||
|
||||
// optimize the function defined by the tensor f
|
||||
enum ggml_opt_result ggml_opt(
|
||||
GGML_API enum ggml_opt_result ggml_opt(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_opt_params params,
|
||||
struct ggml_tensor * f);
|
||||
|
@ -809,32 +828,32 @@ enum ggml_opt_result ggml_opt(
|
|||
// quantization
|
||||
//
|
||||
|
||||
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
|
||||
|
||||
//
|
||||
// system info
|
||||
//
|
||||
|
||||
int ggml_cpu_has_avx(void);
|
||||
int ggml_cpu_has_avx2(void);
|
||||
int ggml_cpu_has_avx512(void);
|
||||
int ggml_cpu_has_avx512_vbmi(void);
|
||||
int ggml_cpu_has_avx512_vnni(void);
|
||||
int ggml_cpu_has_fma(void);
|
||||
int ggml_cpu_has_neon(void);
|
||||
int ggml_cpu_has_arm_fma(void);
|
||||
int ggml_cpu_has_f16c(void);
|
||||
int ggml_cpu_has_fp16_va(void);
|
||||
int ggml_cpu_has_wasm_simd(void);
|
||||
int ggml_cpu_has_blas(void);
|
||||
int ggml_cpu_has_cublas(void);
|
||||
int ggml_cpu_has_sse3(void);
|
||||
int ggml_cpu_has_vsx(void);
|
||||
GGML_API int ggml_cpu_has_avx (void);
|
||||
GGML_API int ggml_cpu_has_avx2 (void);
|
||||
GGML_API int ggml_cpu_has_avx512 (void);
|
||||
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
||||
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
GGML_API int ggml_cpu_has_fp16_va (void);
|
||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
||||
GGML_API int ggml_cpu_has_blas (void);
|
||||
GGML_API int ggml_cpu_has_cublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
||||
|
||||
//
|
||||
|
|
|
@ -54,7 +54,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
|||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH0;
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
|
|||
{ MODEL_7B, 512ull * MB },
|
||||
{ MODEL_13B, 512ull * MB },
|
||||
{ MODEL_30B, 512ull * MB },
|
||||
{ MODEL_65B, 512ull * MB },
|
||||
{ MODEL_65B, 1024ull * MB },
|
||||
};
|
||||
return _MEM_REQ_SCRATCH1;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue