Merge remote-tracking branch 'origin/master' into cli-ui-update

Commit 414b66fcc4: 20 changed files with 485 additions and 176 deletions
@@ -15,4 +15,4 @@ FROM ubuntu:$UBUNTU_VERSION as runtime

 COPY --from=build /app/main /main

 ENTRYPOINT [ "/main" ]
@@ -21,4 +21,4 @@ models/*

 arm_neon.h
 compile_commands.json
 Dockerfile
.ecrc (new file, 5 lines)
@@ -0,0 +1,5 @@
+{
+    "Disable": {
+        "IndentSize": true
+    }
+}
.editorconfig (new file, 16 lines)
@@ -0,0 +1,16 @@
+# https://EditorConfig.org
+
+# Top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file, utf-8 charset
+[*]
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+charset = utf-8
+indent_style = space
+indent_size = 4
+
+[Makefile]
+indent_style = tab
.github/ISSUE_TEMPLATE/custom.md (vendored, 16 changes)
@@ -22,9 +22,9 @@ Please provide a detailed written description of what you were trying to do, and

 # Current Behavior

 Please provide a detailed written description of what `llama.cpp` did, instead.

 # Environment and Context

 Please provide detailed information about your computer setup. This is important in case the issue is not reproducible except for under certain specific conditions.

@@ -133,7 +133,7 @@ llama_model_load: loading model part 8/8 from './models/65B/ggml-model-q4_0.bin.
 llama_model_load: .......................................................................................... done
 llama_model_load: model size = 4869.09 MB / num tensors = 723

 system_info: n_threads = 16 / 32 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 |

 main: prompt: 'Please close your issue when it has been answered.'
 main: number of tokens in prompt = 11

@@ -166,14 +166,14 @@ main: total time = 246406.42 ms

 Performance counter stats for './main -m ./models/65B/ggml-model-q4_0.bin -t 16 -n 1024 -p Please close your issue when it has been answered.':

 3636882.89 msec task-clock # 14.677 CPUs utilized
 13509 context-switches # 3.714 /sec
 2436 cpu-migrations # 0.670 /sec
 10476679 page-faults # 2.881 K/sec
 13133115082869 cycles # 3.611 GHz (16.77%)
 29314462753 stalled-cycles-frontend # 0.22% frontend cycles idle (16.76%)
 10294402631459 stalled-cycles-backend # 78.39% backend cycles idle (16.74%)
 23479217109614 instructions # 1.79 insn per cycle
 # 0.44 stalled cycles per insn (16.76%)
 2353072268027 branches # 647.002 M/sec (16.77%)
 1998682780 branch-misses # 0.08% of all branches (16.76%)
.github/workflows/docker.yml (vendored, 2 changes)
@@ -60,4 +60,4 @@ jobs:
 push: ${{ github.event_name == 'push' }}
 platforms: linux/amd64,linux/arm64
 tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
 file: ${{ matrix.config.dockerfile }}
.github/workflows/editorconfig.yml (new vendored file, 17 lines)
@@ -0,0 +1,17 @@
+name: EditorConfig Checker
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  editorconfig:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - uses: editorconfig-checker/action-editorconfig-checker@main
+    - run: editorconfig-checker
README.md (11 changes)
@@ -42,6 +42,7 @@ New features will probably be added mostly through community contributions.
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
+- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)

 **Bindings:**

@@ -242,7 +243,7 @@ There 26 letters in the English Alphabet
 The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
 > List 5 words that start with "ca".
 cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 >
 ```

 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)

@@ -253,17 +254,17 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):

 ```bash
 python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
 python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
 ```

 - You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
 - The original model is saved in the same folder with a suffix `.orig`

 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data

 - **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this repository, including in issues, discussions or pull requests. They will be immediately deleted.**
 - The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
 - Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
 - Please verify the sha256 checksums of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
 - The following command will verify if you have all possible latest files in your self-installed `./models` subdirectory:

@@ -283,7 +284,7 @@ convert the model from the old format to the new format with [./migrate-ggml-202
 - GPT-3.5 / InstructGPT / ChatGPT:
   - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
   - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)

 ### Perplexity (Measuring model quality)

 You can use the `perplexity` example to measure perplexity over the given prompt. For more background,
@@ -19,15 +19,15 @@ GEN_OPTIONS=(--batch_size 1024
 --top_p 0.5)

 if [ -n "$N_THREAD" ]; then
     GEN_OPTIONS+=(--threads "$N_THREAD")
 fi

 ./main "${GEN_OPTIONS[@]}" \
     --model "$MODEL" \
     --n_predict "$N_PREDICTS" \
     --color --interactive \
     --reverse-prompt "${USER_NAME}:" \
     --prompt "
 This is a transcript of a 1000 page, never ending conversation between ${USER_NAME} and the cute and helpful AI assistant ${AI_NAME}. ${AI_NAME} is a girl who is an AI running on the users computer.
 ${AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next.
 ${AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help.
@@ -25,9 +25,9 @@ extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHand
 extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
 extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
 extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
                                                                    const wchar_t * lpWideCharStr, int cchWideChar,
                                                                    char * lpMultiByteStr, int cbMultiByte,
                                                                    const char * lpDefaultChar, bool * lpUsedDefaultChar);
 #define CP_UTF8 65001
 #endif

@@ -448,10 +448,10 @@ void win32_console_init(bool enable_color) {

 // Convert a wide Unicode string to an UTF8 string
 void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
     int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
     std::string strTo(size_needed, 0);
     WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
     str = strTo;
 }
 #endif
@@ -1,3 +1,3 @@
 # embedding

 TODO
@@ -1,3 +1,3 @@
 # main

 TODO
@@ -1,3 +1,8 @@
+// Defines sigaction on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "common.h"
 #include "llama.h"

@@ -164,7 +169,7 @@ int main(int argc, char ** argv) {
     const auto inp_sfx = ::llama_tokenize(ctx, instruct_suffix, params.instruct_suffix_bos);

     // enable interactive mode if reverse prompt or interactive start is specified
     if (params.antiprompt.size() != 0 || params.stopprompt.size() != 0 || params.interactive_start) {
         params.interactive = true;
     }
@@ -1,3 +1,3 @@
 # perplexity

 TODO
@@ -5,15 +5,15 @@
 #include <string>

 // usage:
-//  ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
+//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
 //
 int main(int argc, char ** argv) {
     ggml_time_init();

     if (argc != 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
-        fprintf(stderr, "  type = 2 - q4_0\n");
-        fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
+        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
         return 1;
     }

@@ -27,7 +27,7 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];

-    const int itype = atoi(argv[3]);
+    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);

     const int64_t t_main_start_us = ggml_time_us();

@@ -37,7 +37,7 @@ int main(int argc, char ** argv) {
     {
         const int64_t t_start_us = ggml_time_us();

-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype)) {
             fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
             return 1;
         }
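As a side note on the API change above, here is a minimal sketch (not part of the diff) of calling the updated quantization entry point directly from C, assuming `llama.h` declares `llama_model_quantize()` with the `enum llama_ftype` parameter shown in the llama.cpp changes further down:

```c
#include "llama.h"

// Quantize an F16 GGML model file to Q4_0 using the new enum-typed API.
// Returns 0 on success, non-zero on failure (mirrors llama_model_quantize).
int quantize_to_q4_0(const char * fname_inp, const char * fname_out) {
    return llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q4_0);
}
```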
ggml.c (420 changes)
@@ -1,4 +1,4 @@
-// Defines CLOCK_MONOTONIC and asprintf on Linux
+// Defines CLOCK_MONOTONIC on Linux
 #define _GNU_SOURCE

 #include "ggml.h"
@@ -26,14 +26,9 @@
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif

-#if defined _MSC_VER || defined(__MINGW32__)
+#if defined(_WIN32)

-#if !defined(__MINGW32__)
-#include <Windows.h>
-#else
-// ref: https://github.com/ggerganov/whisper.cpp/issues/168
 #include <windows.h>
-#endif

 typedef volatile LONG atomic_int;
 typedef atomic_int atomic_bool;
@@ -55,6 +50,7 @@ typedef HANDLE pthread_t;

 typedef DWORD thread_ret_t;
 static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {
+    (void) unused;
     HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);
     if (handle == NULL)
     {

@@ -66,6 +62,7 @@ static int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void
 }

 static int pthread_join(pthread_t thread, void* unused) {
+    (void) unused;
     return (int) WaitForSingleObject(thread, INFINITE);
 }
@@ -231,12 +228,12 @@ static inline float fp32_from_bits(uint32_t w) {
 }

 static inline uint32_t fp32_to_bits(float f) {
     union {
         float as_value;
         uint32_t as_bits;
     } fp32;
     fp32.as_value = f;
     return fp32.as_bits;
 }

 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
@@ -599,10 +596,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
         for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
         for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);

-        // absolute max
-        const float amax = MAX(
-                MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),
-                MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));
+        const float amax = vmaxvq_f32(amaxv[0]);

         const float d = amax / ((1 << 3) - 1);
         const float id = d ? 1.0f/d : 0.0f;
@@ -924,7 +918,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int
         float32x4_t minv[8];
         float32x4_t maxv[8];

-        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*QK + 4*l);

         for (int l = 0; l < 4; l++) minv[2*l] = vminq_f32(srcv[2*l], srcv[2*l + 1]);
         for (int l = 0; l < 2; l++) minv[4*l] = vminq_f32(minv[4*l], minv[4*l + 2]);
@@ -947,7 +941,8 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int

         for (int l = 0; l < 8; l++) {
             const float32x4_t v = vmulq_n_f32(vsubq_f32(srcv[l], minv0), id);
-            const int32x4_t vi = vcvtq_s32_f32(v);
+            const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(0.5f)); // needed to round to nearest
+            const int32x4_t vi = vcvtq_s32_f32(vf);

             y[i].qs[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);
             y[i].qs[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);
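The `vdupq_n_f32(0.5f)` addition above matters because `vcvtq_s32_f32` truncates toward zero. A scalar sketch of the same idea (illustration only, not from the diff):

```c
#include <stdio.h>

// Quantizing a non-negative scaled value: plain float-to-int conversion
// truncates, while adding 0.5 first gives round-to-nearest.
int main(void) {
    const float v = 7.9f;                  // value already scaled into [0, 15]
    const int q_trunc = (int) v;           // old behaviour: 7
    const int q_round = (int) (v + 0.5f);  // new behaviour: 8
    printf("truncated = %d, rounded = %d\n", q_trunc, q_round);
    return 0;
}
```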
@@ -1886,7 +1881,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
         sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));
 #endif
 #else
         const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
         const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));

         const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
@@ -1951,7 +1946,7 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();

     /* Prepare the constants we will need during execution */
     const __m256i lowMask = _mm256_set1_epi8( 0xF );
     const __m256i offset_8 = _mm256_set1_epi16( 8 );
@@ -1961,61 +1956,59 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest

     // Main loop
     for (int i = 0; i < nb; i+=UNROLL_COUNT) {
         // This loop will be unrolled by the compiler
         for (int u=0;u<UNROLL_COUNT;u++) {
             /* Compute combined scale for the block */
             const __m256 scale = _mm256_mul_ps(
                     _mm256_broadcast_ss( &x[i+u].d ),
                     _mm256_broadcast_ss( &y[i+u].d ) );

             /* get input from x
                Input: 32 Nibbles (16 bytes) at *x[i+u]
                Output: 2 vectors with 16 values of type int16_t (x_high_q, x_low_q) */

             /* Load 16 bytes from memory */
             const __m128i tmp_x = _mm_loadu_si128( ( const __m128i* ) x[i+u].qs);
             /* Expand bytes into uint16_t values */
             const __m256i bytes_x = _mm256_cvtepu8_epi16(tmp_x);
             /* Unpack values into individual bytes */
             __m256i x_low_q = _mm256_and_si256( lowMask, bytes_x );
             const __m256i pre_shift_x_high_q = _mm256_andnot_si256( lowMask, bytes_x );
             __m256i x_high_q = _mm256_srli_epi16( pre_shift_x_high_q, 4 );
             /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
             x_high_q = _mm256_sub_epi16( x_high_q, offset_8 );
             x_low_q = _mm256_sub_epi16( x_low_q, offset_8 );

             /* get input from y
                Input: 32 Nibbles (16 bytes) at *y[i+u]
                Output: 2 vectors with 16 values of type int16_t (y_high_q, y_low_q) */

             /* Load 16 bytes from memory */
             const __m128i tmp_y = _mm_loadu_si128( (const __m128i* ) y[i+u].qs);
             /* Expand bytes into uint16_t values */
             const __m256i bytes_y = _mm256_cvtepu8_epi16(tmp_y);
             /* Unpack values into individual bytes */
             const __m256i pre_shift_y_high_q = _mm256_andnot_si256( lowMask, bytes_y );
             __m256i y_high_q = _mm256_srli_epi16( pre_shift_y_high_q, 4 );
             __m256i y_low_q = _mm256_and_si256( lowMask, bytes_y );
             /* Now we have two vectors with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. */
             y_high_q = _mm256_sub_epi16( y_high_q, offset_8 );
             y_low_q = _mm256_sub_epi16( y_low_q, offset_8 );

             /* Compute products of int16_t integers, add pairwise, store as int32_t */
             __m256i xy_high_q = _mm256_madd_epi16( x_high_q, y_high_q );
             __m256i xy_low_q = _mm256_madd_epi16( x_low_q, y_low_q );

             /* Accumulate the products of int32_t integers -> we now have a vector of 8 int_32t */
             __m256i xy_q = _mm256_add_epi32( xy_high_q, xy_low_q );

             /* Convert to vectore of 8 int32_t to 8 floats */
             __m256 q = _mm256_cvtepi32_ps( xy_q );

             /* Multiply q with scale and accumulate */
             acc = _mm256_fmadd_ps( scale, q, acc );
         }
     }

     // Return horizontal sum of the acc vector
     __m128 res = _mm256_extractf128_ps( acc, 1 );
@@ -2076,18 +2069,18 @@ static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * rest
     float sum1 = 0.0f;

     for (int i = 0; i < nb; i += 2) {
-        const block_q4_0 * restrict x0 = &px[i + 0];
-        const block_q4_0 * restrict y0 = &py[i + 0];
-        const block_q4_0 * restrict x1 = &px[i + 1];
-        const block_q4_0 * restrict y1 = &py[i + 1];
+        const block_q4_0 * restrict x0 = &x[i + 0];
+        const block_q4_0 * restrict y0 = &y[i + 0];
+        const block_q4_0 * restrict x1 = &x[i + 1];
+        const block_q4_0 * restrict y1 = &y[i + 1];

         const v128_t m4b = wasm_u8x16_splat(0xf);
         const v128_t s8b = wasm_i8x16_splat(0x8);

-        const v128_t v0_0 = wasm_v128_load(x0.qs);
-        const v128_t v0_1 = wasm_v128_load(y0.qs);
-        const v128_t v1_0 = wasm_v128_load(x1.qs);
-        const v128_t v1_1 = wasm_v128_load(y1.qs);
+        const v128_t v0_0 = wasm_v128_load(x0->qs);
+        const v128_t v0_1 = wasm_v128_load(y0->qs);
+        const v128_t v1_0 = wasm_v128_load(x1->qs);
+        const v128_t v1_1 = wasm_v128_load(y1->qs);

         // 4-bit -> 8-bit
         const v128_t v0_0l = wasm_v128_and(v0_0, m4b);
@@ -2567,29 +2560,26 @@ inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x
 //

 static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
-    QK,
-    QK,
-    1,
-    1,
-    1,
-    1,
-    1,
+    [GGML_TYPE_F32]  = 1,
+    [GGML_TYPE_F16]  = 1,
+    [GGML_TYPE_Q4_0] = QK,
+    [GGML_TYPE_Q4_1] = QK,
+    [GGML_TYPE_I8]   = 1,
+    [GGML_TYPE_I16]  = 1,
+    [GGML_TYPE_I32]  = 1,
 };
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
+static_assert(GGML_TYPE_COUNT == 7, "GGML_BLCK_SIZE is outdated");

 static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
-    sizeof(block_q4_0),
-    sizeof(block_q4_1),
-    sizeof(int8_t ),
-    sizeof(int16_t),
-    sizeof(int32_t),
-    sizeof(ggml_fp16_t),
-    sizeof(float ),
+    [GGML_TYPE_F32]  = sizeof(float),
+    [GGML_TYPE_F16]  = sizeof(ggml_fp16_t),
+    [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
+    [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
+    [GGML_TYPE_I8]   = sizeof(int8_t),
+    [GGML_TYPE_I16]  = sizeof(int16_t),
+    [GGML_TYPE_I32]  = sizeof(int32_t),
 };
-// don't forget to update the array above when adding new types
-static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5");
+static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_SIZE is outdated");

 static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "NONE",
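A small usage sketch (not from the diff) of why the tables above were switched to designated initializers: entries are now keyed by the enum value itself, so lookups stay correct even though the enum ordering in ggml.h changes in this same commit.

```c
// Illustration only: how code elsewhere in ggml.c indexes these tables.
// Because entries are keyed by enum value, reordering the enum cannot desync them.
static size_t q4_0_row_size(int64_t ne0) {
    return (ne0 / GGML_BLCK_SIZE[GGML_TYPE_Q4_0]) * GGML_TYPE_SIZE[GGML_TYPE_Q4_0];
}
```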
@@ -2618,6 +2608,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {

     "SCALE",
     "CPY",
+    "CONT",
     "RESHAPE",
     "VIEW",
     "PERMUTE",

@@ -2633,7 +2624,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "FLASH_FF",
 };

-static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");
+static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");

 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -2662,6 +2653,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {

     "x*v",
     "x-\\>y",
+    "cont(x)",
     "reshape(x)",
     "view(x)",
     "permute(x)",

@@ -2677,7 +2669,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "flash_ff(x)",
 };

-static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");
+static_assert(GGML_OP_COUNT == 36, "GGML_OP_COUNT != 36");

 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4310,6 +4302,41 @@ struct ggml_tensor * ggml_cpy_inplace(
     return ggml_cpy_impl(ctx, a, b, true);
 }

+// ggml_cont
+
+struct ggml_tensor * ggml_cont_impl(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a,
+        bool inplace) {
+    bool is_node = false;
+
+    if (!inplace && a->grad) {
+        GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+
+    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
+
+    result->op   = GGML_OP_CONT;
+    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
+    result->src0 = a;
+    result->src1 = NULL;
+
+    return result;
+}
+
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_cont_impl(ctx, a, false);
+}
+
+struct ggml_tensor * ggml_cont_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a) {
+    return ggml_cont_impl(ctx, a, true);
+}
+
 // ggml_reshape

 struct ggml_tensor * ggml_reshape(
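For context, a hedged usage sketch of the new op (not part of the diff): `ggml_cont` materializes a contiguous copy of a non-contiguous view, which is how the backward-pass change later in this file feeds a transposed matrix into `ggml_mul_mat`.

```c
// Illustration only, assuming a valid ggml_context * ctx and tensors a, b.
// ggml_transpose() returns a non-contiguous view, so make it contiguous
// before multiplying, as the updated mul_mat backward pass does.
struct ggml_tensor * at = ggml_transpose(ctx, a);
struct ggml_tensor * y  = ggml_mul_mat(ctx, ggml_cont(ctx, at), b);
```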
@@ -4852,6 +4879,85 @@ static void ggml_compute_forward_dup_f16(

     // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy

+    if (ggml_is_contiguous(dst)) {
+        if (src0->nb[0] == sizeof(ggml_fp16_t)) {
+            if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                const size_t rs = ne00*nb00;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            char * dst_ptr = (char *) dst->data + id*rs;
+
+                            memcpy(dst_ptr, src0_ptr, rs);
+
+                            id++;
+                        }
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
+            }
+        }
+        return;
+    }
+
     // dst counters
     int64_t i10 = 0;
     int64_t i11 = 0;
@@ -4946,6 +5052,105 @@ static void ggml_compute_forward_dup_f32(
         return;
     }

+    if (src0->type == dst->type &&
+        src0->ne[0] == dst->ne[0] &&
+        src0->nb[0] == GGML_TYPE_SIZE[src0->type] && dst->nb[0] == GGML_TYPE_SIZE[dst->type]) {
+        // copy by rows
+        const size_t rs = ne00*nb00;
+        for (int64_t i03 = 0; i03 < ne03; i03++) {
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    memcpy(
+                        ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),
+                        ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03),
+                        rs);
+                }
+            }
+        }
+        return;
+    }
+
+    if (ggml_is_contiguous(dst)) {
+        // TODO: simplify
+        if (src0->nb[0] == sizeof(float)) {
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                const size_t rs = ne00*nb00;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;
+                            char * dst_ptr = (char *) dst->data + id*rs;
+
+                            memcpy(dst_ptr, src0_ptr, rs);
+
+                            id++;
+                        }
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
+            }
+        } else {
+            //printf("%s: this is not optimal - fix me\n", __func__);
+
+            if (dst->type == GGML_TYPE_F32) {
+                size_t id = 0;
+                float * dst_ptr = (float *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = *src0_ptr;
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else if (dst->type == GGML_TYPE_F16) {
+                size_t id = 0;
+                ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;
+
+                for (int i03 = 0; i03 < ne03; i03++) {
+                    for (int i02 = 0; i02 < ne02; i02++) {
+                        for (int i01 = 0; i01 < ne01; i01++) {
+                            for (int i00 = 0; i00 < ne00; i00++) {
+                                const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+
+                                dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);
+                                id++;
+                            }
+                        }
+                    }
+                }
+            } else {
+                GGML_ASSERT(false); // TODO: implement
+            }
+        }
+
+        return;
+    }
+
     // dst counters
     int64_t i10 = 0;
     int64_t i11 = 0;
@@ -5066,14 +5271,18 @@ static void ggml_compute_forward_add_f32(
     GGML_ASSERT(nb00 == sizeof(float));

     if (nb10 == sizeof(float)) {
-        const int j0 = (n/nth)*ith;
-        const int j1 = ith == nth - 1 ? n : (n/nth)*(ith + 1);
-
-        for (int j = j0; j < j1; j++) {
+        for (int j = ith; j < n; j += nth) {
+#ifdef GGML_USE_ACCELERATE
+            vDSP_vadd(
+                    (float *) ((char *) src0->data + j*nb01), 1,
+                    (float *) ((char *) src1->data + j*nb11), 1,
+                    (float *) ((char *) dst->data  + j*nb1),  1, nc);
+#else
             ggml_vec_add_f32(nc,
                     (float *) ((char *) dst->data  + j*nb1),
                     (float *) ((char *) src0->data + j*nb01),
                     (float *) ((char *) src1->data + j*nb11));
+#endif
         }
     } else {
         // src1 is not contiguous
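The loop rewrite above also changes how rows are split across threads: instead of giving each thread one contiguous block of rows with a special-cased remainder, thread `ith` now takes every `nth`-th row. A minimal sketch of the strided scheme (illustration only, not from the diff):

```c
#include <stdio.h>

// Illustration only: strided row partitioning, as used by the updated
// ggml_compute_forward_add_f32. Thread ith handles rows ith, ith+nth, ith+2*nth, ...
static void process_rows_strided(int n, int ith, int nth) {
    for (int j = ith; j < n; j += nth) {
        printf("thread %d handles row %d\n", ith, j);
    }
}

int main(void) {
    const int n = 10, nth = 3;
    for (int ith = 0; ith < nth; ith++) {
        process_rows_strided(n, ith, nth);
    }
    return 0;
}
```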
@@ -6821,6 +7030,15 @@ static void ggml_compute_forward_cpy(
     ggml_compute_forward_dup(params, src0, dst);
 }

+// ggml_compute_forward_cont
+
+static void ggml_compute_forward_cont(
+        const struct ggml_compute_params * params,
+        const struct ggml_tensor * src0,
+        struct ggml_tensor * dst) {
+    ggml_compute_forward_dup(params, src0, dst);
+}
+
 // ggml_compute_forward_reshape

 static void ggml_compute_forward_reshape(
@@ -8651,6 +8869,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_cpy(params, tensor->src0, tensor);
             } break;
+        case GGML_OP_CONT:
+            {
+                ggml_compute_forward_cont(params, tensor->src0, tensor);
+            } break;
         case GGML_OP_RESHAPE:
             {
                 ggml_compute_forward_reshape(params, tensor->src0, tensor);
@@ -8895,8 +9117,9 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                 src1->grad =
                     ggml_add_impl(ctx,
                             src1->grad,
-                            // TODO: fix transpose, the node will break the graph connections
-                            ggml_mul_mat(ctx, ggml_transpose(ctx, src0), tensor->grad),
+                            ggml_mul_mat(ctx,
+                                ggml_cont(ctx, ggml_transpose(ctx, src0)),
+                                tensor->grad),
                             inplace);
             }
         } break;
@@ -8908,6 +9131,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
             {
                 GGML_ASSERT(false); // TODO: not implemented
             } break;
+        case GGML_OP_CONT:
+            {
+                GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case GGML_OP_RESHAPE:
             {
                 GGML_ASSERT(false); // TODO: not implemented
|
||||||
node->n_tasks = n_threads;
|
node->n_tasks = n_threads;
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CPY:
|
case GGML_OP_CPY:
|
||||||
|
case GGML_OP_CONT:
|
||||||
case GGML_OP_RESHAPE:
|
case GGML_OP_RESHAPE:
|
||||||
case GGML_OP_VIEW:
|
case GGML_OP_VIEW:
|
||||||
case GGML_OP_PERMUTE:
|
case GGML_OP_PERMUTE:
|
||||||
|
|
ggml.h (15 changes)
@@ -198,13 +198,14 @@ struct ggml_object;
 struct ggml_context;

 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
+    // explicitly numbered values are used in llama.cpp files
+    GGML_TYPE_F32  = 0,
+    GGML_TYPE_F16  = 1,
+    GGML_TYPE_Q4_0 = 2,
+    GGML_TYPE_Q4_1 = 3,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
-    GGML_TYPE_F16,
-    GGML_TYPE_F32,
     GGML_TYPE_COUNT,
 };

@@ -236,6 +237,7 @@ enum ggml_op {

     GGML_OP_SCALE,
     GGML_OP_CPY,
+    GGML_OP_CONT,
     GGML_OP_RESHAPE,
     GGML_OP_VIEW,
     GGML_OP_PERMUTE,

@@ -525,6 +527,11 @@ struct ggml_tensor * ggml_cpy(
         struct ggml_tensor * a,
         struct ggml_tensor * b);

+// make contiguous
+struct ggml_tensor * ggml_cont(
+        struct ggml_context * ctx,
+        struct ggml_tensor * a);
+
 // return view(a), b specifies the new shape
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape(
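A short note on the renumbered `ggml_type` values (my reading, not stated in the diff): pinning F32/F16/Q4_0/Q4_1 to 0..3 lets the llama.cpp loader below cast the tensor-type code read from the model file straight to the enum instead of translating it through a per-value switch.

```c
// Illustration only (hypothetical helper): the stored u32 now maps directly.
enum ggml_type tensor_type_from_u32(uint32_t stored) {
    return (enum ggml_type) stored; // 0 -> F32, 1 -> F16, 2 -> Q4_0, 3 -> Q4_1
}
```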
llama.cpp (72 changes)
@@ -1,3 +1,8 @@
+// Defines fileno on msys:
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
 #include "llama_util.h"
 #include "llama.h"
 #include "llama_internal.h"

@@ -77,7 +82,7 @@ struct llama_hparams {
     uint32_t n_head = 32;
     uint32_t n_layer = 32;
     uint32_t n_rot = 64;
-    uint32_t f16 = 1;
+    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;

     bool operator!=(const llama_hparams & other) const {
         return memcmp(this, &other, sizeof(llama_hparams));

@@ -427,7 +432,7 @@ struct llama_file_loader {
         hparams.n_head = file.read_u32();
         hparams.n_layer = file.read_u32();
         hparams.n_rot = file.read_u32();
-        hparams.f16 = file.read_u32();
+        hparams.ftype = (enum llama_ftype) file.read_u32();
     }
     void read_vocab() {
         vocab.id_to_token.resize(hparams.n_vocab);
@@ -453,20 +458,21 @@ struct llama_file_loader {
         llama_load_tensor_shard shard;
         uint32_t n_dims = file.read_u32();
         uint32_t name_len = file.read_u32();
-        uint32_t ftype = file.read_u32();
+        shard.type = (enum ggml_type) file.read_u32();
         shard.ne.resize(n_dims);
         file.read_raw(shard.ne.data(), sizeof(shard.ne[0]) * n_dims);
         std::string name = file.read_string(name_len);
         if (n_dims < 1 || n_dims > 2) {
             throw format("llama.cpp: tensor '%s' should not be %u-dimensional", name.c_str(), n_dims);
         }
-        switch (ftype) {
-            case 0: shard.type = GGML_TYPE_F32; break;
-            case 1: shard.type = GGML_TYPE_F16; break;
-            case 2: shard.type = GGML_TYPE_Q4_0; break;
-            case 3: shard.type = GGML_TYPE_Q4_1; break;
+        switch (shard.type) {
+            case GGML_TYPE_F32:
+            case GGML_TYPE_F16:
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+                break;
             default: {
-                throw format("unrecognized ftype %u\n", ftype);
+                throw format("unrecognized tensor type %u\n", shard.type);
             }
         }

@@ -497,18 +503,18 @@ struct llama_file_loader {
 struct llama_file_saver {
     llama_file file;
     llama_file_loader * any_file_loader;
-    llama_file_saver(const char * fname, llama_file_loader * any_file_loader, uint32_t new_f16)
+    llama_file_saver(const char * fname, llama_file_loader * any_file_loader, enum llama_ftype new_ftype)
         : file(fname, "wb"), any_file_loader(any_file_loader) {
         fprintf(stderr, "llama.cpp: saving model to %s\n", fname);
         write_magic();
-        write_hparams(new_f16);
+        write_hparams(new_ftype);
         write_vocab();
     }
     void write_magic() {
         file.write_u32('ggjt'); // magic
         file.write_u32(1); // version
     }
-    void write_hparams(uint32_t new_f16) {
+    void write_hparams(enum llama_ftype new_ftype) {
         const llama_hparams & hparams = any_file_loader->hparams;
         file.write_u32(hparams.n_vocab);
         file.write_u32(hparams.n_embd);

@@ -516,7 +522,7 @@ struct llama_file_saver {
         file.write_u32(hparams.n_head);
         file.write_u32(hparams.n_layer);
         file.write_u32(hparams.n_rot);
-        file.write_u32(new_f16);
+        file.write_u32(new_ftype);
     }
     void write_vocab() {
         if (any_file_loader->file_version == LLAMA_FILE_VERSION_GGML) {

@@ -531,17 +537,17 @@ struct llama_file_saver {
         }
     }
     void write_tensor(llama_load_tensor & tensor, enum ggml_type new_type, const void * new_data, size_t new_size) {
-        uint32_t ftype;
         switch (new_type) {
-            case GGML_TYPE_F32: ftype = 0; break;
-            case GGML_TYPE_F16: ftype = 1; break;
-            case GGML_TYPE_Q4_0: ftype = 2; break;
-            case GGML_TYPE_Q4_1: ftype = 3; break;
+            case GGML_TYPE_F32:
+            case GGML_TYPE_F16:
+            case GGML_TYPE_Q4_0:
+            case GGML_TYPE_Q4_1:
+                break;
             default: LLAMA_ASSERT(false);
         }
         file.write_u32((uint32_t) tensor.ne.size());
         file.write_u32((uint32_t) tensor.name.size());
-        file.write_u32(ftype);
+        file.write_u32(new_type);
         file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
         file.write_raw(tensor.name.data(), tensor.name.size());
         file.seek(-file.tell() & 31, SEEK_CUR);
@ -815,6 +821,16 @@ static const char *llama_file_version_name(llama_file_version version) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static const char *llama_ftype_name(enum llama_ftype ftype) {
|
||||||
|
switch (ftype) {
|
||||||
|
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_F16: return "mostly F16";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_0: return "mostly Q4_0";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return "mostly Q4_1";
|
||||||
|
default: LLAMA_ASSERT(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static const char *llama_model_type_name(e_model type) {
|
static const char *llama_model_type_name(e_model type) {
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case MODEL_7B: return "7B";
|
case MODEL_7B: return "7B";
|
||||||
@@ -867,7 +883,7 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head);
         fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer);
         fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot);
-        fprintf(stderr, "%s: f16 = %u\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype));
         fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff);
         fprintf(stderr, "%s: n_parts = %zu\n", __func__, ml->file_loaders.size());
         fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type));
@@ -1539,17 +1555,17 @@ static llama_vocab::id llama_sample_top_p_top_k(
 // quantization
 //
 
-static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
+static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype) {
     ggml_type quantized_type;
-    switch (itype) {
-        case 2: quantized_type = GGML_TYPE_Q4_0; break;
-        case 3: quantized_type = GGML_TYPE_Q4_1; break;
-        default: throw format("invalid quantization type %d\n", itype);
+    switch (ftype) {
+        case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
+        case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
+        default: throw format("invalid output file type %d\n", ftype);
     };
 
     std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
                                                                             /*vocab_only*/ false));
-    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), (uint32_t) itype);
+    llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
 
     size_t total_size_org = 0;
     size_t total_size_new = 0;
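Because LLAMA_FTYPE_MOSTLY_Q4_0 and LLAMA_FTYPE_MOSTLY_Q4_1 keep the numeric values 2 and 3 that the old int itype accepted, a caller that still parses an integer can bridge to the new signature with a cast. A minimal sketch under that assumption; the argument handling below is hypothetical and not the repository's quantize example:

#include <cstdlib>
#include "llama.h"

// Hypothetical bridge from a legacy integer argument to the new enum
// (error checking omitted for brevity).
int main(int argc, char ** argv) {
    int itype = (argc > 3) ? atoi(argv[3]) : 2;        // 2 = Q4_0, 3 = Q4_1, as before
    enum llama_ftype ftype = (enum llama_ftype) itype; // same numeric values, stronger type
    return llama_model_quantize(argv[1], argv[2], ftype);
}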
@@ -1740,9 +1756,9 @@ void llama_free(struct llama_context * ctx) {
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,
-        int itype) {
+        enum llama_ftype ftype) {
     try {
-        llama_model_quantize_internal(fname_inp, fname_out, itype);
+        llama_model_quantize_internal(fname_inp, fname_out, ftype);
         return 0;
     } catch (const std::string & err) {
         fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());
10 llama.h
@@ -65,6 +65,14 @@ extern "C" {
         void * progress_callback_user_data;
     };
 
+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32     = 0,
+        LLAMA_FTYPE_MOSTLY_F16  = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();
 
     LLAMA_API bool llama_mmap_supported();
@@ -85,7 +93,7 @@ extern "C" {
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            int itype);
+            enum llama_ftype ftype);
 
     // Returns the KV cache that will contain the context for the
     // ongoing prediction with the model.
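With the header updated, callers pick the output format through the enum rather than a bare integer. A minimal usage sketch; the model paths are placeholders:

#include <cstdio>
#include "llama.h"

int main() {
    // Quantize an F16 model file down to Q4_0; llama_model_quantize returns 0 on success.
    if (llama_model_quantize("models/7B/ggml-model-f16.bin",
                             "models/7B/ggml-model-q4_0.bin",
                             LLAMA_FTYPE_MOSTLY_Q4_0) != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}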
12 llama_util.h
@@ -26,7 +26,9 @@
 
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
 #include <windows.h>
 #include <io.h>
 #include <stdio.h> // for _fseeki64
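The #ifndef guard matters when the build already defines NOMINMAX, for example via a -D compiler flag: an unconditional #define would then redefine the macro with a different token sequence and trigger a warning, while the guarded form is simply skipped. A small standalone sketch of the pattern; the compile flag mentioned in the comment is hypothetical, not taken from the repository's build files:

// Sketch: imagine this translation unit is built with -DNOMINMAX=1.
#ifndef NOMINMAX
#define NOMINMAX        // only reached when nothing has defined NOMINMAX yet
#endif
#include <windows.h>
#include <algorithm>

// With NOMINMAX in effect, windows.h does not define min/max macros,
// so std::max resolves normally.
static int larger(int a, int b) { return std::max(a, b); }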
@@ -209,6 +211,7 @@ struct llama_mmap {
             throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
         }
 
+        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         // Advise the kernel to preload the mapped memory
         WIN32_MEMORY_RANGE_ENTRY range;
         range.VirtualAddress = addr;
@@ -217,6 +220,9 @@ struct llama_mmap {
             fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
         }
+        #else
+        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
     }
 
     ~llama_mmap() {
@@ -338,8 +344,8 @@ struct llama_mlock {
         // Hopefully a megabyte is enough overhead:
         size_t increment = size + 1048576;
         // The minimum must be <= the maximum, so we need to increase both:
-        min_ws_size += size;
-        max_ws_size += size;
+        min_ws_size += increment;
+        max_ws_size += increment;
         if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
             fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());
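The last hunk fixes llama_mlock on Windows: the computed increment (the requested size plus 1 MiB of slack) was calculated but ignored, and only size was added to the working-set limits. A minimal standalone sketch of the corrected logic, using a hypothetical helper rather than the member function itself:

#include <windows.h>

// Hypothetical helper mirroring the fixed arithmetic: both limits grow by the same
// increment, so min_ws_size <= max_ws_size is preserved, and the extra 1 MiB leaves
// headroom for pages the process needs in its working set besides the locked region.
static bool grow_working_set(size_t size) {
    SIZE_T min_ws_size = 0, max_ws_size = 0;
    if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
        return false;
    }
    SIZE_T increment = (SIZE_T) size + 1048576;  // requested bytes + 1 MiB of overhead
    min_ws_size += increment;
    max_ws_size += increment;
    return SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size) != 0;
}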