Resolved merge conflicts.
commit 3878230201

7 changed files with 247 additions and 94 deletions
.github/workflows/build.yml (vendored): 14 changes

@@ -33,6 +33,20 @@ jobs:
         run: |
           make
 
+  windows-latest:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v1
+
+      - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
   # ubuntu-latest-gcc:
   #   runs-on: ubuntu-latest
   #
CMakeLists.txt (new file): 128 additions

@@ -0,0 +1,128 @@
+cmake_minimum_required(VERSION 3.8)
+project("llama.cpp")
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED true)
+set(CMAKE_C_STANDARD 11)
+
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
+
+option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+
+option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
+option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
+
+if (APPLE)
+    option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF)
+    option(LLAMA_NO_AVX "llama: disable AVX" OFF)
+    option(LLAMA_NO_AVX2 "llama: disable AVX2" OFF)
+    option(LLAMA_NO_FMA "llama: disable FMA" OFF)
+endif()
+
+if (NOT MSVC)
+    if (LLAMA_SANITIZE_THREAD)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
+    endif()
+
+    if (LLAMA_SANITIZE_ADDRESS)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+    endif()
+
+    if (LLAMA_SANITIZE_UNDEFINED)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
+    endif()
+endif()
+
+if (APPLE AND NOT LLAMA_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
+        set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
+
+if (LLAMA_ALL_WARNINGS)
+    if (NOT MSVC)
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
+            -Wall \
+            -Wextra \
+            -Wpedantic \
+            -Wshadow \
+            -Wcast-qual \
+            -Wstrict-prototypes \
+            -Wpointer-arith \
+            -Wno-unused-function \
+        ")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
+            -Wall \
+            -Wextra \
+            -Wpedantic \
+            -Wcast-qual \
+        ")
+    else()
+        # todo : msvc
+    endif()
+endif()
+
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+    message(STATUS "ARM detected")
+else()
+    message(STATUS "x86 detected")
+    if (MSVC)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
+    else()
+        if(NOT LLAMA_NO_AVX)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
+        endif()
+        if(NOT LLAMA_NO_AVX2)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
+        endif()
+        if(NOT LLAMA_NO_FMA)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
+        endif()
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
+    endif()
+endif()
+
+# if (LLAMA_PERF)
+#     set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
+# endif()
+
+add_executable(llama
+    main.cpp
+    utils.cpp
+    utils.h)
+
+add_executable(quantize
+    quantize.cpp
+    utils.cpp
+    utils.h)
+
+add_library(ggml
+    ggml.c
+    ggml.h)
+
+target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
+target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS})
+target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
+
+target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
+target_include_directories(ggml PUBLIC .)
+target_link_libraries(quantize PRIVATE ggml)
+target_link_libraries(llama PRIVATE ggml)
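For reference (not part of the commit): a quick way to confirm that the AVX2/FMA options set by the CMakeLists.txt above actually reach the compiler is to check the predefined feature macros. The file name and program below are hypothetical, just a minimal sketch:

// check_simd_flags.c - hypothetical helper, not part of this commit.
// If CMake passed -mavx2 / -mfma (GCC/Clang) or /arch:AVX2 (MSVC),
// the corresponding predefined macros are visible at compile time.
#include <stdio.h>

int main(void) {
#if defined(__AVX2__)
    printf("AVX2 enabled at compile time\n");
#else
    printf("AVX2 not enabled\n");
#endif
#if defined(__FMA__)
    printf("FMA enabled at compile time\n"); // GCC/Clang macro; MSVC does not define it
#else
    printf("FMA not enabled\n");
#endif
    return 0;
}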
Makefile: 4 changes

@@ -48,6 +48,10 @@ ifeq ($(UNAME_S),FreeBSD)
 	CFLAGS += -pthread
 	CXXFLAGS += -pthread
 endif
+ifeq ($(UNAME_S),NetBSD)
+	CFLAGS += -pthread
+	CXXFLAGS += -pthread
+endif
 ifeq ($(UNAME_S),Haiku)
 	CFLAGS += -pthread
 	CXXFLAGS += -pthread
README.md: 26 changes

@@ -5,11 +5,6 @@
 
 Inference of [Facebook's LLaMA](https://github.com/facebookresearch/llama) model in pure C/C++
 
-**Hot topics**
-
-- Running on Windows: https://github.com/ggerganov/llama.cpp/issues/22
-- Fix Tokenizer / Unicode support: https://github.com/ggerganov/llama.cpp/issues/11
-
 ## Description
 
 The main goal is to run the model using 4-bit quantization on a MacBook

@@ -23,14 +18,14 @@ The main goal is to run the model using 4-bit quantization on a MacBook
 
 This was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022) - I have no idea if it works correctly.
 Please do not make conclusions about the models based on the results from this implementation.
-For all I know, it can be completely wrong. This project is for educational purposes and is not going to be maintained properly.
-New features will probably be added mostly through community contributions, if any.
+For all I know, it can be completely wrong. This project is for educational purposes.
+New features will probably be added mostly through community contributions.
 
 Supported platforms:
 
 - [X] Mac OS
 - [X] Linux
-- [ ] Windows (soon)
+- [X] Windows (via CMake)
 
 ---
 

@@ -179,10 +174,6 @@ Note the use of `--color` to distinguish between user input and generated text.
 
 ## Limitations
 
-- Not sure if my tokenizer is correct. There are a few places where we might have a mistake:
-  - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/convert-pth-to-ggml.py#L79-L87
-  - https://github.com/ggerganov/llama.cpp/blob/26c084662903ddaca19bef982831bfb0856e8257/utils.h#L65-L69
-  In general, it seems to work, but I think it fails for unicode character support. Hopefully, someone can help with that
 - I don't know yet how much the quantization affects the quality of the generated text
 - Probably the token sampling can be improved
 - The Accelerate framework is actually currently unused since I found that for tensor shapes typical for the Decoder,

@@ -192,16 +183,15 @@
 
 ### Contributing
 
-- There are 2 git branches: [master](https://github.com/ggerganov/llama.cpp/commits/master) and [dev](https://github.com/ggerganov/llama.cpp/commits/dev)
-- Contributors can open PRs to either one
-- Collaborators can push straight into `dev`, but need to open a PR to get stuff to `master`
+- Contributors can open PRs
+- Collaborators can push to branches in the `llama.cpp` repo
 - Collaborators will be invited based on contributions
-- `dev` branch is considered unstable
-- `master` branch is considered stable and approved. 3-rd party projects should use the `master` branch
 
-General principles to follow when writing code:
+### Coding guide-lines
 
 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
 - Avoid fancy looking modern STL constructs, use basic for loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
+- Clean-up any tailing whitespaces, use 4 spaces indentation, brackets on same line, `int * var`
+- Look at the [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks
ggml.c: 34 changes

@@ -2,7 +2,7 @@
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif
 

@@ -1360,34 +1360,20 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
         const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);
 
         // dot product into int16x8_t
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
-
-        const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);
-        const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);
-
-        const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);
-        const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);
-
-        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);
-        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);
+        // assume that vdotq_s32 is always available, if not, should check for __ARM_FEATURE_DOTPROD
+        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);
+        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);
+
+        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);
+        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);
 
         // scalar
 #if defined(__ARM_FEATURE_QRDMX)
-        sum0 += d0_0*d1_0*vaddvq_s16(p_0);
-        sum1 += d0_1*d1_1*vaddvq_s16(p_1);
+        sum0 += d0_0*d1_0*vaddvq_s32(p_0);
+        sum1 += d0_1*d1_1*vaddvq_s32(p_1);
 #else
-        sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));
-        sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));
+        sum0 += d0_0*d1_0*(vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));
+        sum1 += d0_1*d1_1*(vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
 #endif
     }
 
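For context on the ggml.c hunk above: it replaces the widening vmull_s8 / vaddq_s16 reduction with the ARMv8.2 dot-product instruction. The standalone sketch below is not from the repository; the names are illustrative, it assumes an AArch64 target, and the vdotq_s32 path additionally assumes the dot-product extension (for example, building with -march=armv8.2-a+dotprod):

// dot_sketch.c - illustrative sketch only, not part of the commit.
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

#if defined(__ARM_FEATURE_DOTPROD)
// New-style path: vdotq_s32 folds 16 int8*int8 products into 4 int32 lanes.
static int32_t dot16_new(int8x16_t a, int8x16_t b) {
    int32x4_t acc = vdotq_s32(vdupq_n_s32(0), a, b);
    return vaddvq_s32(acc); // horizontal sum of the 4 lanes
}
#endif

// Old-style path: widen to int16x8_t with vmull_s8, then reduce.
// The int16 reduction only stays exact while the products are small,
// which holds for the 4-bit quantized values used in ggml_vec_dot_q4_0.
static int32_t dot16_old(int8x16_t a, int8x16_t b) {
    const int16x8_t lo = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    return vaddvq_s16(vaddq_s16(lo, hi));
}

int main(void) {
    const int8_t va[16] = { 1, 2, 3, 4, 5, 6, 7, 8, -1, -2, -3, -4, -5, -6, -7, -8 };
    const int8_t vb[16] = { 1, 1, 1, 1, 1, 1, 1, 1,  1,  1,  1,  1,  1,  1,  1,  1 };
    const int8x16_t a = vld1q_s8(va);
    const int8x16_t b = vld1q_s8(vb);
    printf("old path: %d\n", (int) dot16_old(a, b));
#if defined(__ARM_FEATURE_DOTPROD)
    printf("new path: %d\n", (int) dot16_new(a, b)); // same result, fewer instructions
#endif
    return 0;
}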
main.cpp: 133 changes

@@ -86,9 +86,6 @@ struct llama_model {
    std::map<std::string, struct ggml_tensor *> tensors;
 };
 
-
-#define USE_MMAP 1
-
 #ifndef USE_MMAP
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
 #define USE_MMAP 1

@@ -207,9 +204,13 @@ using llama_istream = std::ifstream;
 
 // load the model's weights from a file
 bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
-    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
-    llama_istream fin{fname};
+    llama_istream fin{fname, std::ios::binary};
+#if !USE_MMAP
+    std::vector<char> f_buf(1024*1024);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif
     if (!fin) {
         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
         return false;

@@ -246,16 +247,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
         n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
-        printf("%s: n_head = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
-        printf("%s: f16 = %d\n", __func__, hparams.f16);
-        printf("%s: n_ff = %d\n", __func__, n_ff);
-        printf("%s: n_parts = %d\n", __func__, n_parts);
+        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+        fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
+        fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
+        fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
+        fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
+        fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
+        fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
+        fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
     }
 
     // load vocab

@@ -280,7 +281,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             vocab.id_to_token[i] = word;
 
             //if (i < 30000) {
-            //    printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
+            //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
             //}
         }
     }

@@ -339,7 +340,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
        ctx_size += (5 + 10*n_layer)*256; // object overhead
 
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
     }
 
     // create the ggml context

@@ -426,7 +427,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 
-        printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+        fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
     }
 
     const size_t file_offset = fin.tellg();

@@ -444,9 +445,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
            fname_part += "." + std::to_string(i);
        }
 
-        printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 
-        llama_istream fin{fname_part};
+        llama_istream fin{fname_part, std::ios::binary};
+#if !USE_MMAP
+        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+#endif
         fin.seekg(file_offset);
 
         // load weights

@@ -454,7 +458,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         int n_tensors = 0;
         size_t total_size = 0;
 
-        printf("%s: ", __func__);
+        fprintf(stderr, "%s: ", __func__);
 
         while (true) {
             int32_t n_dims;

@@ -554,7 +558,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
            if (0) {
                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
+                fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
            }
 
            size_t bpe = 0;

@@ -617,16 +621,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
                total_size += ggml_nbytes(tensor)/n_parts;
            }
 
-            //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
            if (++n_tensors % 8 == 0) {
-                printf(".");
-                fflush(stdout);
+                fprintf(stderr, ".");
+                fflush(stderr);
            }
        }
 
-        printf(" done\n");
+        fprintf(stderr, " done\n");
 
-        printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
     }
 
     fin.close();

@@ -670,7 +674,7 @@ bool llama_eval(
 
    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
 
        // reallocate
        buf_size = buf_size_new;

@@ -862,7 +866,7 @@
    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+    //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));
 
    ggml_free(ctx0);
 

@@ -883,6 +887,26 @@ void sigint_handler(int signo) {
 }
 #endif
 
+const char * llama_print_system_info(void) {
+    static std::string s;
+
+    s = "";
+    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
+    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
+    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
+    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+
+    return s.c_str();
+}
+
 int main(int argc, char ** argv) {
    ggml_time_init();
    const int64_t t_main_start_us = ggml_time_us();

@@ -898,7 +922,7 @@ int main(int argc, char ** argv) {
        params.seed = time(NULL);
    }
 
-    printf("%s: seed = %d\n", __func__, params.seed);
+    fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
 
    std::mt19937 rng(params.seed);
    if (params.prompt.empty()) {

@@ -925,6 +949,13 @@
        t_load_us = ggml_time_us() - t_start_us;
    }
 
+    // print system information
+    {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
+    }
+
    int n_past = 0;
 
    int64_t t_sample_us = 0;

@@ -940,13 +971,13 @@
    // tokenize the reverse prompt
    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
 
-    printf("\n");
-    printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
-    printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+    fprintf(stderr, "\n");
+    fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+    fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
    for (int i = 0; i < (int) embd_inp.size(); i++) {
-        printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+        fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
    }
-    printf("\n");
+    fprintf(stderr, "\n");
    if (params.interactive) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
        struct sigaction sigint_action;

@@ -956,19 +987,19 @@
        sigaction(SIGINT, &sigint_action, NULL);
 #endif
 
-        printf("%s: interactive mode on.\n", __func__);
+        fprintf(stderr, "%s: interactive mode on.\n", __func__);
 
        if(antiprompt_inp.size()) {
-            printf("%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
-            printf("%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
+            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
            for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                printf("%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
            }
-            printf("\n");
+            fprintf(stderr, "\n");
        }
    }
-    printf("sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
-    printf("\n\n");
+    fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "\n\n");
 
    std::vector<gpt_vocab::id> embd;
 

@@ -982,7 +1013,7 @@
 
 
    if (params.interactive) {
-        printf("== Running in interactive mode. ==\n"
+        fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
               " - Press Ctrl+C to interject at any time.\n"
 #endif

@@ -1010,7 +1041,7 @@
            const int64_t t_start_us = ggml_time_us();
 
            if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
-                printf("Failed to predict\n");
+                fprintf(stderr, "Failed to predict\n");
                return 1;
            }
 

@@ -1123,7 +1154,7 @@
 
        // end of text token
        if (embd.back() == 2) {
-            printf(" [end of text]\n");
+            fprintf(stderr, " [end of text]\n");
            break;
        }
    }

@@ -1133,12 +1164,12 @@
    {
        const int64_t t_main_end_us = ggml_time_us();
 
-        printf("\n\n");
-        printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
-        printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
-        printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
-        printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
-        printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        fprintf(stderr, "\n\n");
+        fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+        fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+        fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+        fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+        fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
    }
 
    ggml_free(model.ctx);
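The bulk of the main.cpp changes above move diagnostics from printf to fprintf(stderr, ...). A minimal sketch of the resulting convention (illustrative only, not from the repository): logs and statistics go to stderr, generated text goes to stdout, so the two streams can be redirected independently, for example sending stdout to a file while the log stays on the terminal.

// log_sketch.c - illustrative only, not part of the commit.
#include <stdio.h>

int main(void) {
    fprintf(stderr, "loading model ...\n");  // progress / statistics on stderr
    fprintf(stderr, "system_info: ...\n");   // more diagnostics on stderr
    printf("the generated text itself\n");   // payload output, the only thing on stdout
    fflush(stderr);
    return 0;
}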
Seventh changed file: 2 changes

@@ -11,7 +11,7 @@
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 #include <alloca.h>
 #endif
 