Merge branch 'master' into interactive-eos-fix

This commit is contained in:
rabidcopy 2023-03-22 01:54:07 -05:00 committed by GitHub
commit 6fb0db31d7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 2044 additions and 1773 deletions

View file

@ -108,7 +108,7 @@ jobs:
cd build
cmake ..
cmake --build . --config Release
ctest --output-on-failure
ctest -C Release --output-on-failure
- name: Get commit hash
id: commit
@ -119,7 +119,7 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\*
7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\bin\Release\*
- name: Create release
id: create_release

View file

@ -207,15 +207,10 @@ else()
message(STATUS "Unknown architecture")
endif()
#
# Build library
# Build libraries
#
add_executable(llama main.cpp)
add_executable(quantize quantize.cpp)
add_library(utils OBJECT
utils.cpp
utils.h)
@ -229,14 +224,24 @@ add_library(ggml OBJECT
target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
#
# Linking
#
target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
target_link_libraries(llama PRIVATE ggml utils)
target_link_libraries(quantize PRIVATE ggml utils)
add_library(llama OBJECT
llama.cpp
llama.h)
target_include_directories(llama PUBLIC .)
target_compile_features(llama PUBLIC cxx_std_11) # don't bump
#
# Executables
#
add_executable(main main.cpp)
target_link_libraries(main PRIVATE llama ggml utils)
add_executable(quantize quantize.cpp)
target_link_libraries(quantize PRIVATE llama ggml utils)
#
# programs, examples and tests

View file

@ -220,18 +220,21 @@ default: main quantize
ggml.o: ggml.c ggml.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o
llama.o: llama.cpp llama.h
$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
utils.o: utils.cpp utils.h
$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
clean:
rm -f *.o main quantize
main: main.cpp ggml.o utils.o
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
main: main.cpp ggml.o llama.o utils.o
$(CXX) $(CXXFLAGS) main.cpp ggml.o llama.o utils.o -o main $(LDFLAGS)
@echo "\x1b[36mrun ./main -h for help\x1b[0m"
quantize: quantize.cpp ggml.o utils.o
$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)
quantize: quantize.cpp ggml.o llama.o utils.o
$(CXX) $(CXXFLAGS) quantize.cpp ggml.o llama.o utils.o -o quantize $(LDFLAGS)
#
# Tests

View file

@ -7,13 +7,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Hot topics:**
- New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
**TEMPORARY NOTICE:**
If you're updating to the latest master, you will need to regenerate your model files as the format has changed.
## Description
The main goal is to run the model using 4-bit quantization on a MacBook
@ -228,6 +226,20 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
* The LLaMA models are officially distributed by Facebook and will never be provided through this repository. See this [Pull Request in Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to obtain access to the model data.
* Please verify the sha256 checksums of all of your `consolidated*.pth` and corresponding converted `ggml-model-*.bin` model files to confirm that you have the correct model data files before creating an issue relating to your model files.
The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory:
`sha256sum --ignore-missing -c SHA256SUMS` on Linux
or
`shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS
### Android
You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).

53
SHA256SUMS Normal file
View file

@ -0,0 +1,53 @@
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
abe4aec2cdc297e2916011f66c7efd6fb4424e0e84315503005b5c118358cc22 models/7B/ggml-model-f16.bin
f495fa02a0b5ef265e1864d9680eede7fd23a60b0a2f93edba8091e2a4ca68b9 models/7B/ggml-model-q4_0.bin
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
a6bd0537c6873f36c47292df0b6f794e1135f5aafb89c3343bcc9e93264bf167 models/13B/ggml-model-f16.bin
0fb0951b90f2ec46c1f2f2372af5dacb4614b27e9fb6c10c69fbec58d7dd0e36 models/13B/ggml-model-f16.bin.1
1c218ba37ae61e15e35efd9949c78d6edf553b6280824c263cad56ae0b9d5a8f models/13B/ggml-model-q4_0.bin
c37a20c2ab9fa74b006b389085660269ee06110d1e45a494eb57d4602c9bcdb2 models/13B/ggml-model-q4_0.bin.1
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
def20ea508f4e36793719f857471e85b85f96e497a2cbffbbaa1b60e2b18202c models/30B/ggml-model-f16.bin
b37040aa67fa8608cb2d8e0719132cf3e267fd35ec1e2f0d37dbc9fa43d674f1 models/30B/ggml-model-f16.bin.1
e7f263557e99069fe29003262ea5fa9ed885dbe79069083e6eb569b328cf30d3 models/30B/ggml-model-f16.bin.2
2ad6a23af05eb720f202f63d130f4fc5de9b6d2efc95b921be003209a56695aa models/30B/ggml-model-f16.bin.3
7de31d005e6d02ebd9603b2cf5329ad2f832b65d08873a098c5cafc4046cb9ed models/30B/ggml-model-q4_0.bin
f91feef9f30f9a023616db2e91297ca6d5d5d7b9eb351e452a82115c46f7da9e models/30B/ggml-model-q4_0.bin.1
66f3a0916ac7a81839153eb061fa861030ed1892477c2f7af2ce4f98d2f6d06f models/30B/ggml-model-q4_0.bin.2
e3c587ba97f83d2088b001bcda3026571065649ee3090bef6743a51390b01d3b models/30B/ggml-model-q4_0.bin.3
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/consolidated.02.pth
73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e models/65B/consolidated.03.pth
882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225 models/65B/consolidated.04.pth
a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
7eba2625260cd91f8de901fd9704a1aa39448425514a335a0d3878de4ab9dc77 models/65B/ggml-model-f16.bin
f6aa886575df0785d4231f30cc776d499ccde18857818effc0378c65b178e0b5 models/65B/ggml-model-f16.bin.1
076037141682f5d7537955058c4740ab27f285aa4588915f830874a589c0693d models/65B/ggml-model-f16.bin.2
7853d96d2903ad7de2b2a89c4acf5a33a2f8e3c24ac39c9df6b44cdb42bf530a models/65B/ggml-model-f16.bin.3
b16b7b941abb3bc03a14df1656140855e9360a5371c83e919b9da83a72362314 models/65B/ggml-model-f16.bin.4
5291270216f888697695acb78ef28df0c080f9e85d3245c92fb9992d1fde6678 models/65B/ggml-model-f16.bin.5
0685ee77715f34686841006f8f94d3e7eaf148b97cecc9d3eee72808b0f7989c models/65B/ggml-model-f16.bin.6
00d993d73bb21d7c29388ffe0dced008cbaa0d391831dea77d7eb8f0b5c404b9 models/65B/ggml-model-f16.bin.7
4e398f05842206e08cdc5e7bb4f6c7c34b9dc373435ece6f261b14b7b4fe9b89 models/65B/ggml-model-q4_0.bin
4c4e899e3b12d9f57c9dcea5a1fb41bbc72023323535551f6273582ca7d7294b models/65B/ggml-model-q4_0.bin.1
d7b4594bbbd192043b3db0e5acc2561c42e6944e1cb91cc6e61510eee89dbcd8 models/65B/ggml-model-q4_0.bin.2
9a099d271648863d923d0d097391ea0bc75591f27a2ca3a327760f42e6b69af2 models/65B/ggml-model-q4_0.bin.3
5ee474051e418c5732b7949190b084d9d679db447f83c1de0d2a82daaa1a0cfa models/65B/ggml-model-q4_0.bin.4
a45aa05e7212bd6782790722d68056c5419667ea6b564ccc94bbcb8111d79b8b models/65B/ggml-model-q4_0.bin.5
a58fda714b759c28ad5e4c1d8bf8fda7b158fd5e4c4a49f851f36342fa97a105 models/65B/ggml-model-q4_0.bin.6
a3540cfcbcda33c223c6b0d606034adbd78f17e0e5de1582b78795e78754f7a8 models/65B/ggml-model-q4_0.bin.7
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
1f582babc2bd56bb63b33141898748657d369fd110c4358b2bc280907882bf13 models/alpaca-7B/ggml-model-q4_0.bin
e17730c6b62b565b098af023ca446dcb9e3535d4222ead6369c7aae67207eb3d models/alpaca-13B/ggml-model-q4_0.bin
9bcd1bb30e679c939f367be11b030fe20b3eb9a3606b9bc4106420f1827b6ae4 models/alpaca-30B/ggml-model-q4_0.bin
36079249f53c292a4c2302d7784005dcae94c865f0bedfdbfa51d9ddad402935 models/alpaca-30B/params.json

View file

@ -148,7 +148,7 @@ def main():
model = torch.load(fname_model, map_location="cpu")
with open(fname_out, "wb") as fout:
fout.write(struct.pack("i", hparams["vocab_size"]))
write_header(fout, hparams, ftype)
write_tokens(fout, tokenizer)
del model

View file

@ -25,7 +25,7 @@ GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --
--prompt "
Text transcript of a never ending dialog, where ${USER_NAME} interacts with an AI assistant named ${AI_NAME}.
${AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer ${USER_NAME}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what ${USER_NAME} and ${AI_NAME} say alound to each other.
There are no annotations like (30 seconds passed...) or (to himself), just what ${USER_NAME} and ${AI_NAME} say aloud to each other.
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
The transcript only includes text, it does not include markup like HTML and Markdown.

121
ggml.c
View file

@ -10702,6 +10702,127 @@ enum ggml_opt_result ggml_opt(
////////////////////////////////////////////////////////////////////////////////
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;
assert(k % qk == 0);
const size_t pp_size = qk / 2;
uint8_t * pp = (uint8_t *) alloca(pp_size);
char * pdst = (char *) dst;
for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
amax = MAX(amax, fabsf(v));
}
const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;
*(float *) pd = d;
pd += bs;
for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0])*id;
const float v1 = (src[j + i*qk + l + 1])*id;
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
hist[vi0]++;
hist[vi1]++;
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(pb, pp, pp_size);
pb += bs;
}
}
}
return (n/k)*row_size;
}
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;
assert(k % qk == 0);
const size_t pp_size = qk / 2;
uint8_t * pp = (uint8_t *) alloca(pp_size);
char * pdst = (char *) dst;
for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
for (int i = 0; i < nb; i++) {
float min = FLT_MAX;
float max = -FLT_MAX;
{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
if (v < min) min = v;
if (v > max) max = v;
}
const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
*(float *) pd = d;
*(float *) pm = min;
pd += bs;
pm += bs;
for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0] - min)*id;
const float v1 = (src[j + i*qk + l + 1] - min)*id;
const uint8_t vi0 = round(v0);
const uint8_t vi1 = round(v1);
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
hist[vi0]++;
hist[vi1]++;
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(pb, pp, pp_size);
pb += bs;
}
}
}
return (n/k)*row_size;
}
////////////////////////////////////////////////////////////////////////////////
int ggml_cpu_has_avx(void) {
#if defined(__AVX__)
return 1;

7
ggml.h
View file

@ -741,6 +741,13 @@ enum ggml_opt_result ggml_opt(
struct ggml_opt_params params,
struct ggml_tensor * f);
//
// quantization
//
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
//
// system info
//

1569
llama.cpp Normal file

File diff suppressed because it is too large Load diff

139
llama.h Normal file
View file

@ -0,0 +1,139 @@
#ifndef LLAMA_H
#define LLAMA_H
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef LLAMA_SHARED
# ifdef _WIN32
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
# define LLAMA_API __declspec(dllimport)
# endif
# else
# define LLAMA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAMA_API
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// TODO: show sample usage
//
struct llama_context;
typedef int llama_token;
typedef struct llama_token_data {
llama_token id; // token id
float p; // probability of the token
float plog; // log probability of the token
} llama_token_data;
struct llama_context_params {
int n_ctx; // text context
int n_parts; // -1 for default
int seed; // RNG seed, 0 for random
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
};
LLAMA_API struct llama_context_params llama_context_default_params();
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
LLAMA_API struct llama_context * llama_init_from_file(
const char * path_model,
struct llama_context_params params);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
// TODO: not great API - very likely to change
// Returns 0 on success
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype,
int qk);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
LLAMA_API int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
int n_tokens,
int n_past,
int n_threads);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos();
LLAMA_API llama_token llama_token_eos();
// TODO: improve the last_n_tokens interface ?
LLAMA_API llama_token llama_sample_top_p_top_k(
llama_context * ctx,
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
double top_p,
double temp,
double repeat_penalty);
// Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
#ifdef __cplusplus
}
#endif
#endif

942
main.cpp

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -1,319 +1,17 @@
#include "ggml.h"
#include "llama.h"
#include "utils.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <string>
#include <vector>
#include <regex>
// TODO: move somewhere else
#define QK 32
// default hparams (LLaMA76B)
struct llama_hparams {
int32_t n_vocab = 32000;
int32_t n_ctx = 512; // this is provided as user input?
int32_t n_embd = 4096;
int32_t n_mult = 256;
int32_t n_head = 32;
int32_t n_layer = 32;
int32_t n_rot = 64;
int32_t f16 = 1;
};
// quantize a model
bool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) {
ggml_type type = GGML_TYPE_Q4_1;
switch (itype) {
case 2: type = GGML_TYPE_Q4_0; break;
case 3: type = GGML_TYPE_Q4_1; break;
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
};
if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
return false;
}
llama_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
auto finp = std::ifstream(fname_inp, std::ios::binary);
if (!finp) {
fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
return false;
}
auto fout = std::ofstream(fname_out, std::ios::binary);
if (!fout) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
return false;
}
// verify magic
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
if (magic == FILE_MAGIC_UNVERSIONED) {
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
__func__, fname_inp.c_str());
return false;
}
if (magic != FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}
fout.write((char *) &magic, sizeof(magic));
uint32_t format_version;
finp.read((char *) &format_version, sizeof(format_version));
if (format_version != FILE_VERSION) {
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
__func__, fname_inp.c_str(), format_version, FILE_VERSION);
return false;
}
fout.write((char *) &format_version, sizeof(format_version));
}
llama_hparams hparams;
// load hparams
{
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
//finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
//fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fout.write((char *) &itype, sizeof(hparams.f16));
}
// load vocab
{
const int32_t n_vocab = hparams.n_vocab;
if (n_vocab != hparams.n_vocab) {
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
__func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
return false;
}
std::string word;
vocab.id_to_token.resize(n_vocab);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
finp.read ((char *) &len, sizeof(len));
fout.write((char *) &len, sizeof(len));
word.resize(len);
finp.read ((char *) word.data(), len);
fout.write((char *) word.data(), len);
float score;
finp.read ((char *) &score, sizeof(score));
fout.write((char *) &score, sizeof(score));
vocab.token_to_id[word] = i;
auto &tok_score = vocab.id_to_token[i];
tok_score.tok = word;
tok_score.score = score;
}
}
// load weights
{
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<float> work;
std::vector<uint8_t> data_u8;
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;
std::vector<int64_t> hist_all(1 << 4, 0);
while (true) {
int32_t n_dims;
int32_t length;
int32_t ftype;
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
if (finp.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
finp.read (&name[0], length);
{
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
}
// regexes of tensor names to be quantized
const std::vector<std::string> k_names = {
".*weight",
};
bool quantize = false;
for (const auto & s : k_names) {
if (std::regex_match(name, std::regex(s))) {
quantize = true;
break;
}
}
// quantize only 2D tensors
quantize &= (n_dims == 2);
if (quantize) {
if (ftype != 0 && ftype != 1) {
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
return false;
}
if (ftype == 1) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) {
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
}
} else {
data_f32.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
}
ftype = itype;
} else {
const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
}
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
for (int i = 0; i < n_dims; ++i) {
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
}
fout.write(&name[0], length);
if (quantize) {
printf("quantizing .. ");
work.resize(nelements); // for quantization
size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0);
switch (type) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
} break;
default:
{
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
return false;
}
}
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
total_size_new += cur_size;
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
for (int i = 0; i < hist_cur.size(); ++i) {
hist_all[i] += hist_cur[i];
}
for (int i = 0; i < hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
}
printf("\n");
} else {
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
total_size_new += data_u8.size();
}
total_size_org += nelements * sizeof(float);
}
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
{
int64_t sum_all = 0;
for (int i = 0; i < hist_all.size(); ++i) {
sum_all += hist_all[i];
}
printf("%s: hist: ", __func__);
for (int i = 0; i < hist_all.size(); ++i) {
printf("%5.3f ", hist_all[i] / (float)sum_all);
}
printf("\n");
}
}
finp.close();
fout.close();
return true;
}
const int QK = 32;
// usage:
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
int main(int argc, char ** argv) {
ggml_time_init();
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
fprintf(stderr, " type = 2 - q4_0\n");
@ -341,7 +39,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();
if (!llama_model_quantize(fname_inp, fname_out, itype)) {
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}

View file

@ -1,4 +1,4 @@
set(TEST_TARGET test-tokenizer-0)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE utils)
target_link_libraries(${TEST_TARGET} PRIVATE llama ggml utils)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)

View file

@ -1,10 +1,11 @@
#include "utils.h"
#include "llama.h"
#include <cstdio>
#include <string>
#include <map>
static const std::map<std::string, std::vector<llama_vocab::id>> k_tests = {
static const std::map<std::string, std::vector<llama_token>> k_tests = {
{ "Hello World", { 1, 10994, 2787, }, },
{ " Hello World", { 1, 15043, 2787, }, },
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
@ -23,14 +24,23 @@ int main(int argc, char **argv) {
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_vocab vocab;
llama_context * ctx;
if (!llama_vocab_load(fname, vocab)) {
fprintf(stderr, "%s : failed to load vocab from: '%s'\n", __func__, fname.c_str());
return 1;
// load the vocab
{
auto lparams = llama_context_default_params();
lparams.vocab_only = true;
ctx = llama_init_from_file(fname.c_str(), lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
}
const int n_vocab = vocab.id_to_token.size();
const int n_vocab = llama_n_vocab(ctx);
if (n_vocab != 32000) {
fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
@ -38,7 +48,7 @@ int main(int argc, char **argv) {
}
for (const auto & test_kv : k_tests) {
const auto res = llama_tokenize(vocab, test_kv.first, true);
const auto res = ::llama_tokenize(ctx, test_kv.first, true);
bool correct = res.size() == test_kv.second.size();

521
utils.cpp
View file

@ -3,12 +3,9 @@
#include <cassert>
#include <cstring>
#include <fstream>
#include <regex>
#include <iostream>
#include <iterator>
#include <queue>
#include <string>
#include <math.h>
#include <iterator>
#include <algorithm>
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
@ -104,7 +101,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT (can be\n");
fprintf(stderr, " specified more than once for multiple prompts).\n");
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for <= 0)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: empty)\n");
@ -147,509 +144,11 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
return "The";
}
void replace(std::string & str, const std::string & needle, const std::string & replacement) {
size_t pos = 0;
while ((pos = str.find(needle, pos)) != std::string::npos) {
str.replace(pos, needle.length(), replacement);
pos += replacement.length();
}
}
std::unordered_map<std::string, int32_t> json_parse(const std::string & fname) {
std::unordered_map<std::string, int32_t> result;
// read file into string
std::string json;
{
std::ifstream ifs(fname);
if (!ifs) {
fprintf(stderr, "Failed to open %s\n", fname.c_str());
exit(1);
}
json = std::string((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
}
if (json[0] != '{') {
return result;
}
// parse json
{
bool has_key = false;
bool in_token = false;
std::string str_key = "";
std::string str_val = "";
int n = json.size();
for (int i = 1; i < n; ++i) {
if (!in_token) {
if (json[i] == ' ') continue;
if (json[i] == '"') {
in_token = true;
continue;
}
} else {
if (json[i] == '\\' && i+1 < n) {
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
++i;
} else if (json[i] == '"') {
if (has_key == false) {
has_key = true;
++i;
while (json[i] == ' ') ++i;
++i; // :
while (json[i] == ' ') ++i;
if (json[i] != '\"') {
while (json[i] != ',' && json[i] != '}') {
str_val += json[i++];
}
has_key = false;
} else {
in_token = true;
continue;
}
} else {
has_key = false;
}
::replace(str_key, "\\u0120", " " ); // \u0120 -> space
::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
::replace(str_key, "\\\"", "\""); // \\\" -> "
try {
result[str_key] = std::stoi(str_val);
} catch (...) {
//fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
}
str_key = "";
str_val = "";
in_token = false;
continue;
}
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
}
}
}
return result;
}
static size_t utf8_len(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
return lookup[highbits];
}
struct llama_sp_symbol {
using index = int;
index prev;
index next;
const char * text;
size_t n;
};
struct llama_sp_bigram {
struct comparator {
bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
return (l.score < r.score) || (l.score == r.score && l.left > r.left);
}
};
using queue_storage = std::vector<llama_sp_bigram>;
using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
llama_sp_symbol::index left;
llama_sp_symbol::index right;
float score;
size_t size;
};
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
struct llama_tokenizer {
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars
int index = 0;
size_t offs = 0;
while (offs < text.size()) {
llama_sp_symbol sym;
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
sym.text = text.c_str() + offs;
sym.n = char_len;
offs += char_len;
sym.prev = index - 1;
sym.next = offs == text.size() ? -1 : index + 1;
index++;
symbols_.emplace_back(std::move(sym));
}
// seed the work queue with all possible 2-character tokens.
for (size_t i = 1; i < symbols_.size(); ++i) {
try_add_bigram(i - 1, i);
}
// keep substituting the highest frequency pairs for as long as we can.
while (!work_queue_.empty()) {
auto bigram = work_queue_.top();
work_queue_.pop();
auto & left_sym = symbols_[bigram.left];
auto & right_sym = symbols_[bigram.right];
// if one of the symbols already got merged, skip it.
if (left_sym.n == 0 || right_sym.n == 0 ||
left_sym.n + right_sym.n != bigram.size) {
continue;
}
// merge the right sym into the left one
left_sym.n += right_sym.n;
right_sym.n = 0;
//printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
// remove the right sym from the chain
left_sym.next = right_sym.next;
if (right_sym.next >= 0) {
symbols_[right_sym.next].prev = bigram.left;
}
// find more substitutions
try_add_bigram(left_sym.prev, bigram.left);
try_add_bigram(bigram.left, left_sym.next);
}
for (int i = 0; i != -1; i = symbols_[i].next) {
auto & symbol = symbols_[i];
auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
if (token == vocab_.token_to_id.end()) {
// output any symbols that did not form tokens as bytes.
for (int j = 0; j < (int) symbol.n; ++j) {
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
output.push_back(token_id);
}
} else {
output.push_back((*token).second);
}
}
}
private:
void try_add_bigram(int left, int right) {
if (left == -1 || right == -1) {
return;
}
const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
auto token = vocab_.token_to_id.find(text);
if (token == vocab_.token_to_id.end()) {
return;
}
if (static_cast<size_t>((*token).second) >= vocab_.id_to_token.size()) {
return;
}
const auto &tok_score = vocab_.id_to_token[(*token).second];
llama_sp_bigram bigram;
bigram.left = left;
bigram.right = right;
bigram.score = tok_score.score;
bigram.size = text.size();
work_queue_.push(bigram);
}
const llama_vocab & vocab_;
std::vector<llama_sp_symbol> symbols_;
llama_sp_bigram::queue work_queue_;
};
// TODO: temporary code duplication with llama.cpp
// will resolve after #77 is merged
bool llama_vocab_load(const std::string & fname, llama_vocab & vocab) {
std::ifstream fin(fname, std::ios::binary);
if (!fin.is_open()) {
return false;
}
int n_vocab = 0;
fin.read((char *) &n_vocab, sizeof(n_vocab));
std::string word;
std::vector<char> tmp(64);
vocab.id_to_token.resize(n_vocab);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));
word.resize(len);
if (len > 0) {
tmp.resize(len);
fin.read(tmp.data(), len);
word.assign(tmp.data(), len);
} else {
word.clear();
}
float score;
fin.read((char *) &score, sizeof(score));
vocab.token_to_id[word] = i;
auto &tok_score = vocab.id_to_token[i];
tok_score.tok = word;
tok_score.score = score;
}
return true;
}
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
llama_tokenizer tokenizer(vocab);
std::vector<llama_vocab::id> output;
if (text.size() == 0) {
return output;
}
if (bos) {
output.push_back(1);
}
tokenizer.tokenize(text, output);
return output;
}
void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k) {
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, llama_vocab::id> & a, const std::pair<double, llama_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
}
llama_vocab::id llama_sample_top_p_top_k(
const llama_vocab & vocab,
const float * logits,
std::vector<llama_vocab::id> & last_n_tokens,
double repeat_penalty,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, llama_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const double scale = 1.0/temp;
for (int i = 0; i < n_logits; ++i) {
// repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
if (logits[i] < 0.0) {
logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
}
} else {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
}
}
}
sample_top_k(logits_id, top_k);
double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}
// compute probs for the top K tokens
std::vector<double> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
for (int i = 0; i < (int) probs.size(); i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
probs.resize(i + 1);
logits_id.resize(i + 1);
break;
}
}
cumsum = 1.0/cumsum;
for (int i = 0; i < (int) probs.size(); i++) {
probs[i] *= cumsum;
}
}
//printf("\n");
//for (int i = 0; i < (int) 10; i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//}
//printf("\n\n");
//exit(0);
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;
assert(k % qk == 0);
const size_t pp_size = qk / 2;
uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
char * pdst = (char *) dst;
for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
for (int i = 0; i < nb; i++) {
float amax = 0.0f; // absolute max
{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
amax = std::max(amax, fabsf(v));
}
const float d = amax / ((1 << 3) - 1);
const float id = d ? 1.0f/d : 0.0f;
*(float *) pd = d;
pd += bs;
for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0])*id;
const float v1 = (src[j + i*qk + l + 1])*id;
const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
hist[vi0]++;
hist[vi1]++;
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(pb, pp, pp_size);
pb += bs;
}
}
}
return (n/k)*row_size;
}
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
const int nb = k / qk;
const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
const size_t row_size = nb*bs;
assert(k % qk == 0);
const size_t pp_size = qk / 2;
uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
char * pdst = (char *) dst;
for (int j = 0; j < n; j += k) {
uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
//printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
for (int i = 0; i < nb; i++) {
float min = std::numeric_limits<float>::max();
float max = std::numeric_limits<float>::min();
{
for (int l = 0; l < qk; l++) {
const float v = src[j + i*qk + l];
if (v < min) min = v;
if (v > max) max = v;
}
const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
*(float *) pd = d;
*(float *) pm = min;
pd += bs;
pm += bs;
for (int l = 0; l < qk; l += 2) {
const float v0 = (src[j + i*qk + l + 0] - min)*id;
const float v1 = (src[j + i*qk + l + 1] - min)*id;
const uint8_t vi0 = round(v0);
const uint8_t vi1 = round(v1);
assert(vi0 >= 0 && vi0 < 16);
assert(vi1 >= 0 && vi1 < 16);
hist[vi0]++;
hist[vi1]++;
pp[l/2] = vi0 | (vi1 << 4);
}
memcpy(pb, pp, pp_size);
pb += bs;
}
}
}
return (n/k)*row_size;
// TODO: not great allocating this every time
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
std::vector<llama_token> res(8096);
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
res.resize(n);
return res;
}

61
utils.h
View file

@ -2,8 +2,9 @@
#pragma once
#include "llama.h"
#include <string>
#include <unordered_map>
#include <vector>
#include <random>
#include <thread>
@ -49,64 +50,8 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Model file parsing
//
#define FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#define FILE_MAGIC 0x67676d66 // 'ggmf' in hex
#define FILE_VERSION 1
//
// Vocab utils
//
struct llama_vocab {
using id = int32_t;
using token = std::string;
struct token_score {
token tok;
float score;
};
std::unordered_map<token, id> token_to_id;
std::vector<token_score> id_to_token;
};
void replace(std::string & str, const std::string & needle, const std::string & replacement);
// poor-man's JSON parsing
std::unordered_map<std::string, int32_t> json_parse(const std::string & fname);
// TODO: temporary until #77 is merged, need this now for some tokenizer tests
bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);
// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
// ref: https://github.com/google/sentencepiece
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);
// sample next token given probabilities for each embedding
//
// - consider only the top K tokens
// - from them, consider only the top tokens with cumulative probability > P
//
llama_vocab::id llama_sample_top_p_top_k(
const llama_vocab & vocab,
const float * logits,
std::vector<llama_vocab::id> & last_n_tokens,
double repeat_penalty,
int top_k,
double top_p,
double temp,
std::mt19937 & rng);
// filer to top K tokens from list of logits
void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k);
//
// Quantization
//
size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);