Merge branch 'master' into multi-thread-quantize

This commit is contained in:
Georgi Gerganov 2023-04-20 20:41:29 +03:00 committed by GitHub
commit b3545d9a2a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 615 additions and 140 deletions

View file

@ -81,7 +81,6 @@ jobs:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
accelerate: [ON, OFF]
steps:
- name: Clone
@ -99,7 +98,7 @@ jobs:
run: |
mkdir build
cd build
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }}
cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }}
- name: Test

View file

@ -110,6 +110,7 @@ if (APPLE AND LLAMA_ACCELERATE)
message(WARNING "Accelerate framework not found")
endif()
endif()
if (LLAMA_OPENBLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@ -150,6 +151,10 @@ if (LLAMA_CUBLAS)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
enable_language(CUDA)
set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
if (LLAMA_STATIC)
@ -241,21 +246,26 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX512)
add_compile_options(/arch:AVX512)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX512>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, neither it defines the
# macros corresponding to the extensions.
# Do it manually.
if (LLAMA_AVX512_VBMI)
add_compile_definitions(__AVX512VBMI__)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (LLAMA_AVX512_VNNI)
add_compile_definitions(__AVX512VNNI__)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif (LLAMA_AVX2)
add_compile_options(/arch:AVX2)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX2>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX2>)
elseif (LLAMA_AVX)
add_compile_options(/arch:AVX)
add_compile_options($<$<COMPILE_LANGUAGE:C>:/arch:AVX>)
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/arch:AVX>)
endif()
else()
if (LLAMA_F16C)
@ -292,7 +302,8 @@ endif()
add_library(ggml OBJECT
ggml.c
ggml.h)
ggml.h
${GGML_CUDA_SOURCES})
target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
@ -314,6 +325,14 @@ if (BUILD_SHARED_LIBS)
target_compile_definitions(llama PRIVATE LLAMA_SHARED LLAMA_BUILD)
endif()
if (GGML_CUDA_SOURCES)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
set_property(TARGET llama PROPERTY CUDA_ARCHITECTURES OFF)
endif()
#
# programs, examples and tests
#

View file

@ -1,3 +1,6 @@
# Define the default target now so that it is always the first target
default: main quantize quantize-stats perplexity embedding vdot
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif
@ -99,7 +102,10 @@ ifdef LLAMA_OPENBLAS
endif
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include
LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -L/usr/local/cuda/lib64
LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -lrt -L/usr/local/cuda/lib64
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
nvcc -arch=native -c -o $@ $<
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
@ -137,8 +143,6 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
default: main quantize quantize-stats perplexity embedding vdot
#
# Build library
#
@ -155,35 +159,35 @@ common.o: examples/common.cpp examples/common.h
clean:
rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
main: examples/main/main.cpp ggml.o llama.o common.o
main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./main -h for help. ===='
@echo
quantize: examples/quantize/quantize.cpp ggml.o llama.o
quantize: examples/quantize/quantize.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
vdot: pocs/vdot/vdot.cpp ggml.o
vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
libllama.so: llama.o ggml.o
libllama.so: llama.o ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
#
# Tests
#
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o
benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
./benchmark-q4_0-matmult

View file

@ -9,7 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
**Warnings**
- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalize
- `Q4_2` and `Q4_3` are still in development. Do not expect any kind of backward compatibility until they are finalized
**Hot topics:**
@ -19,7 +19,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
## Description
The main goal is to run the model using 4-bit quantization on a MacBook
The main goal of llama.cpp is to run the llama model using 4-bit quantization on a MacBook.
- Plain C/C++ implementation without dependencies
- Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
@ -156,7 +156,7 @@ https://user-images.githubusercontent.com/1991296/224442907-7693d4be-acaa-4e01-8
## Usage
Here are the step for the LLaMA-7B model.
Here are the steps for the LLaMA-7B model.
### Get the Code
@ -214,8 +214,7 @@ When running the larger models, make sure you have enough disk space to store al
### Memory/Disk Requirements
As the models are currently fully loaded into memory, you will need adequate disk space to save them
and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
| model | original size | quantized size (4-bit) |
|-------|---------------|------------------------|
@ -227,18 +226,18 @@ and sufficient RAM to load them. At the moment, memory and disk requirements are
### Interactive mode
If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
In this mode, you can always interrupt generation by pressing Ctrl+C and enter one or more lines of text which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt which makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
Here is an example few-shot interaction, invoked with the command
Here is an example of a few-shot interaction, invoked with the command
```bash
# default arguments using 7B model
# default arguments using a 7B model
./examples/chat.sh
# advanced chat with 13B model
# advanced chat with a 13B model
./examples/chat-13B.sh
# custom arguments using 13B model
# custom arguments using a 13B model
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```
@ -277,7 +276,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
- Obtain the `gpt4all-lora-quantized.bin` model
- It is distributed in the old `ggml` format which is now obsoleted
- It is distributed in the old `ggml` format, which is now obsoleted
- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
@ -291,7 +290,7 @@ convert the model from the old format to the new format with [./migrate-ggml-202
### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data
- **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this respository, including in issues, discussions or pull requests. They will be immediately deleted.**
- **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.**
- The LLaMA models are officially distributed by Facebook and will **never** be provided through this repository.
- Refer to [Facebook's LLaMA repository](https://github.com/facebookresearch/llama/pull/73/files) if you need to request access to the model data.
- Please verify the [sha256 checksums](SHA256SUMS) of all downloaded model files to confirm that you have the correct model data files before creating an issue relating to your model files.
@ -303,29 +302,27 @@ convert the model from the old format to the new format with [./migrate-ggml-202
`shasum -a 256 --ignore-missing -c SHA256SUMS` on macOS
- If your issue is with model generation quality then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA:
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
- GPT-3
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
- GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
- If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:
- LLaMA:
- [Introducing LLaMA: A foundational, 65-billion-parameter large language model](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/)
- [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
- GPT-3
- [Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165)
- GPT-3.5 / InstructGPT / ChatGPT:
- [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
- [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
### Perplexity (Measuring model quality)
### Perplexity (measuring model quality)
You can use the `perplexity` example to measure perplexity over the given prompt. For more background,
see https://huggingface.co/docs/transformers/perplexity. However, in general, lower perplexity is better for LLMs.
You can use the `perplexity` example to measure perplexity over the given prompt. For more background, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). However, in general, lower perplexity is better for LLMs.
#### Latest measurements
The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well
compared to the baseline implementations. Quantization has a small negative impact to quality, but, as you can see, running
The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well compared to the baseline implementations. Quantization has a small negative impact on quality, but, as you can see, running
13B at q4_0 beats the 7B f16 model by a significant amount.
All measurements are done against wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
Note that the changing the context length will have a significant impact on perplexity (longer context = better perplexity).
All measurements are done against the wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
Note that changing the context length will have a significant impact on perplexity (longer context = better perplexity).
```
Perplexity - model options
5.5985 - 13B, q4_0
@ -367,7 +364,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
#### Prerequisites
* Docker must be installed and running on your system.
* Create a folder to store big models & intermediate files (in ex. im using /llama/models)
* Create a folder to store big models & intermediate files (ex. /llama/models)
#### Images
We have two Docker images available for this project:
@ -381,17 +378,17 @@ The easiest way to download the models, convert them to ggml and optimize them i
Replace `/path/to/models` below with the actual path where you downloaded the models.
```bash
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
```
On complete, you are ready to play!
On completion, you are ready to play!
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
```
or with light image:
or with a light image:
```bash
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
@ -412,7 +409,7 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode
- Always consider cross-compatibility with other operating systems and architectures
- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
- There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
- Clean-up any trailing whitespaces, use 4 spaces indentation, brackets on same line, `void * ptr`, `int & a`
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
### Docs

View file

@ -15,6 +15,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
fprintf(stderr, " type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
return 1;
}

116
ggml-cuda.cu Normal file
View file

@ -0,0 +1,116 @@
#include <stdint.h>
#include <cuda_fp16.h>
#include "ggml-cuda.h"
typedef uint16_t ggml_fp16_t;
static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#define QK4_0 32
typedef struct {
float d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32
typedef struct {
float d; // delta
float m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK4_2 16
typedef struct {
__half d; // delta
uint8_t qs[QK4_2 / 2]; // nibbles / quants
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
const block_q4_0 * x = (const block_q4_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_0; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
y[i*QK4_0 + l + 0] = v0;
y[i*QK4_0 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
const block_q4_1 * x = (const block_q4_1 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const float m = x[i].m;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_1; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*QK4_1 + l + 0] = v0;
y[i*QK4_1 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
const block_q4_2 * x = (const block_q4_2 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_2; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
y[i*QK4_2 + l + 0] = v0;
y[i*QK4_2 + l + 1] = v1;
}
}
extern "C" {
__host__ void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_0;
dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
}
__host__ void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_1;
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
}
__host__ void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_2;
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
}
}

11
ggml-cuda.h Normal file
View file

@ -0,0 +1,11 @@
#ifdef __cplusplus
extern "C" {
#endif
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
#ifdef __cplusplus
}
#endif

491
ggml.c
View file

@ -150,23 +150,25 @@ inline static void* ggml_aligned_malloc(size_t size) {
#elif defined(GGML_USE_CUBLAS)
#include <cublas_v2.h>
#include <cuda_runtime.h>
#define CUDA_CHECK(err) \
do { \
cudaError_t err_ = (err); \
if (err_ != cudaSuccess) { \
printf("CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
cudaGetErrorString(err_)); \
exit(1); \
} \
#include "ggml-cuda.h"
#define CUDA_CHECK(err) \
do { \
cudaError_t err_ = (err); \
if (err_ != cudaSuccess) { \
printf("CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
cudaGetErrorString(err_)); \
exit(1); \
} \
} while (0)
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
printf("cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
exit(1); \
} \
#define CUBLAS_CHECK(err) \
do { \
cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \
printf("cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)
static cublasHandle_t cublasH = NULL;
@ -177,6 +179,7 @@ static void init_cublas(void) {
CUBLAS_CHECK(cublasCreate(&cublasH));
CUDA_CHECK(cudaStreamCreateWithFlags(&cudaStream, cudaStreamNonBlocking));
CUBLAS_CHECK(cublasSetStream(cublasH, cudaStream));
// configure logging to stdout
@ -464,12 +467,30 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
// quantization
//
// AVX routines provided by GH user Const-me
// ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600
#if __AVX__ || __AVX2__ || __AVX512F__
// Unpack 16 4-bit fields into 16 bytes
// The output vector contains 16 bytes, each one in [ 0 .. 15 ] interval
static inline __m128i bytes_from_nibbles_16(const uint8_t * rsi)
{
// Load 8 bytes from memory
__m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
// Expand bytes into uint16_t values
__m128i bytes = _mm_cvtepu8_epi16( tmp );
// Unpack values into individual bytes
const __m128i lowMask = _mm_set1_epi8( 0xF );
__m128i high = _mm_andnot_si128( lowMask, bytes );
__m128i low = _mm_and_si128( lowMask, bytes );
high = _mm_slli_epi16( high, 4 );
bytes = _mm_or_si128( low, high );
return bytes;
}
#if __AVX2__ || __AVX512F__
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
static inline __m256i bytesFromNibbles( const uint8_t* rsi )
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
{
// Load 16 bytes from memory
__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
@ -500,24 +521,7 @@ static inline __m128i packNibbles( __m256i bytes )
__m128i r1 = _mm256_extracti128_si256( bytes, 1 );
return _mm_packus_epi16( r0, r1 );
}
#elif __AVX__
static inline __m128i bytesFromNibbles( const uint8_t* rsi )
{
// Load 8 bytes from memory
__m128i tmp = _mm_loadu_si64( ( const __m128i* )rsi );
// Expand bytes into uint16_t values
__m128i bytes = _mm_cvtepu8_epi16( tmp );
// Unpack values into individual bytes
const __m128i lowMask = _mm_set1_epi8( 0xF );
__m128i high = _mm_andnot_si128( lowMask, bytes );
__m128i low = _mm_and_si128( lowMask, bytes );
high = _mm_slli_epi16( high, 4 );
bytes = _mm_or_si128( low, high );
return bytes;
}
#else
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
{
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
@ -534,6 +538,7 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
return _mm_packus_epi16( bytes1, bytes2);
}
#endif
#endif // __AVX__ || __AVX2__ || __AVX512F__
#if __ARM_NEON
@ -632,7 +637,7 @@ typedef struct {
float m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
static_assert(sizeof(block_q4_1) == 2 * sizeof(float) + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK4_2 16
typedef struct {
@ -641,6 +646,14 @@ typedef struct {
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
#define QK4_3 16
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qs[QK4_3 / 2]; // nibbles / quants
} block_q4_3;
static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
#define QK8_0 32
typedef struct {
float d; // delta
@ -1198,7 +1211,6 @@ static void quantize_row_q4_2_rmse(const float * restrict x, block_q4_2 * restri
const int nb = k / QK4_2;
for (int i = 0; i < nb; i++) {
float scale = kquantize_q4_with_bounds(QK4_2, -8, 7, x, CANDIDATE_COUNT, candidates, L);
y[i].d = GGML_FP32_TO_FP16(scale);
@ -1226,6 +1238,49 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
quantize_row_q4_2_rmse(x, y, k);
}
static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) {
assert(k % QK4_3 == 0);
const int nb = k / QK4_3;
for (int i = 0; i < nb; i++) {
float min = FLT_MAX;
float max = -FLT_MAX;
for (int l = 0; l < QK4_3; l++) {
const float v = x[i*QK4_3 + l];
if (v < min) min = v;
if (v > max) max = v;
}
const float d = (max - min) / ((1 << 4) - 1);
const float id = d ? 1.0f/d : 0.0f;
y[i].d = GGML_FP32_TO_FP16(d);
y[i].m = GGML_FP32_TO_FP16(min);
for (int l = 0; l < QK4_3; l += 2) {
const float v0 = (x[i*QK4_3 + l + 0] - min)*id;
const float v1 = (x[i*QK4_3 + l + 1] - min)*id;
const uint8_t vi0 = (int) (v0 + 0.5f);
const uint8_t vi1 = (int) (v1 + 0.5f);
assert(vi0 < 16);
assert(vi1 < 16);
y[i].qs[l/2] = vi0 | (vi1 << 4);
}
}
}
static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) {
assert(k % QK4_3 == 0);
block_q4_3 * restrict y = vy;
quantize_row_q4_3_reference(x, y, k);
}
// reference implementation for deterministic creation of model files
static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) {
assert(k % QK8_0 == 0);
@ -1392,7 +1447,7 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
for (int l = 0; l < QK4_0; l += 32) {
// Load 32x4-bit integers into 32x8-bit integers
__m256i vx8 = bytesFromNibbles(pp+l/2);
__m256i vx8 = bytes_from_nibbles_32(pp+l/2);
// Subtract 8 from the integers
vx8 = _mm256_sub_epi8(vx8, _mm256_set1_epi8(8));
@ -1510,7 +1565,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in
for (int l = 0; l < QK4_1; l += 32) {
// Load 32x4-bit integers into 32x8-bit integers
__m256i vx8 = bytesFromNibbles(pp+l/2);
__m256i vx8 = bytes_from_nibbles_32(pp+l/2);
// Convert to 16-bit int
const __m256i vx16_lo = _mm256_cvtepi8_epi16(_mm256_extracti128_si256(vx8, 0));
@ -1630,9 +1685,40 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
}
}
static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) {
assert(k % QK4_3 == 0);
const int nb = k / QK4_3;
const block_q4_3 * restrict x = vx;
for (int i = 0; i < nb; i++) {
const float d = GGML_FP16_TO_FP32(x[i].d);
const float m = GGML_FP16_TO_FP32(x[i].m);
const uint8_t * restrict pp = x[i].qs;
for (int l = 0; l < QK4_3; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*QK4_3 + l + 0] = v0;
y[i*QK4_3 + l + 1] = v1;
assert(!isnan(y[i*QK4_3 + l + 0]));
assert(!isnan(y[i*QK4_3 + l + 1]));
}
}
}
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_0] = {
@ -1656,6 +1742,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
.quantize_row_q_dot = quantize_row_q8_0,
.vec_dot_q = ggml_vec_dot_q4_2_q8_0,
},
[GGML_TYPE_Q4_3] = {
.dequantize_row_q = dequantize_row_q4_3,
.quantize_row_q = quantize_row_q4_3,
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference, // TODO: RMSE optimization
.quantize_row_q_dot = quantize_row_q8_0,
.vec_dot_q = ggml_vec_dot_q4_3_q8_0,
},
[GGML_TYPE_Q8_0] = {
.dequantize_row_q = NULL, // TODO
.quantize_row_q = quantize_row_q8_0,
@ -2353,7 +2446,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
/* Compute combined scale for the block */
const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
__m256i bx = bytesFromNibbles(x[i].qs);
__m256i bx = bytes_from_nibbles_32(x[i].qs);
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8( 8 );
@ -2399,7 +2492,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
__m128i i32[2];
for (int j = 0; j < 2; ++j) {
// Load 8 bytes, and unpack 4 bit fields into bytes, making 16 bytes
__m128i bx = bytesFromNibbles( x[i].qs + 8*j );
__m128i bx = bytes_from_nibbles_16(x[i].qs + 8*j);
__m128i by = _mm_loadu_si128((const __m128i *)(y[i].qs + 16*j));
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
@ -2564,7 +2657,7 @@ static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void *
const __m256 d1m0 = _mm256_mul_ps( d1v, m0v );
// Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
const __m256i bx = bytesFromNibbles( x[i].qs );
const __m256i bx = bytes_from_nibbles_32(x[i].qs);
const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
// Get absolute values of x vectors
@ -2650,6 +2743,7 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
const block_q4_2 * restrict x0_1 = &x[2*(i + 0) + 1];
const block_q4_2 * restrict x1_0 = &x[2*(i + 1) + 0];
const block_q4_2 * restrict x1_1 = &x[2*(i + 1) + 1];
const block_q8_0 * restrict y0 = &y[i + 0];
const block_q8_0 * restrict y1 = &y[i + 1];
@ -2718,6 +2812,51 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
}
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#elif defined(__AVX2__)
// Initialize accumulator with zeros
__m256 acc = _mm256_setzero_ps();
// Main loop
for (int i = 0; i < nb; i++) {
/* Compute combined scale for the block */
const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
const __m256 d = _mm256_mul_ps(_mm256_set_m128(d1, d0), _mm256_broadcast_ss(&y[i].d));
__m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
__m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
__m256i bx = _mm256_set_m128i(bx1, bx0);
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8(8);
bx = _mm256_sub_epi8(bx, off);
__m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
// Get absolute values of x vectors
const __m256i ax = _mm256_sign_epi8(bx, bx);
// Sign the values of the y vectors
const __m256i sy = _mm256_sign_epi8(by, bx);
// Perform multiplication and create 16-bit values
const __m256i dot = _mm256_maddubs_epi16(ax, sy);
const __m256i ones = _mm256_set1_epi16(1);
__m256i xy_q = _mm256_madd_epi16(ones, dot);
/* Convert to vectore of 8 int32_t to 8 floats */
__m256 q = _mm256_cvtepi32_ps(xy_q);
/* Multiply q with scale and accumulate */
acc = _mm256_fmadd_ps(d, q, acc);
}
// Return horizontal sum of the acc vector
__m128 res = _mm256_extractf128_ps(acc, 1);
res = _mm_add_ps(res, _mm256_castps256_ps128(acc));
res = _mm_add_ps(res, _mm_movehl_ps(res, res));
res = _mm_add_ss(res, _mm_movehdup_ps(res));
sumf = _mm_cvtss_f32(res);
#else
// scalar
for (int i = 0; i < nb; i++) {
@ -2759,6 +2898,154 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
*s = sumf;
}
static void ggml_vec_dot_q4_3_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
const int nb = n / QK8_0;
assert(n % QK8_0 == 0);
assert(nb % 2 == 0);
assert(QK8_0 == 2*QK4_2);
const block_q4_3 * restrict x = vx;
const block_q8_0 * restrict y = vy;
float sumf = 0.0;
#if defined(__ARM_NEON)
float32x4_t sumv0 = vdupq_n_f32(0.0f);
float32x4_t sumv1 = vdupq_n_f32(0.0f);
for (int i = 0; i < nb; i += 2) {
const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
const block_q4_3 * restrict x1_0 = &x[2*(i + 1) + 0];
const block_q4_3 * restrict x1_1 = &x[2*(i + 1) + 1];
const block_q8_0 * restrict y0 = &y[i + 0];
const block_q8_0 * restrict y1 = &y[i + 1];
const uint8x16_t m4b = vdupq_n_u8(0xf);
const float x0_0d = GGML_FP16_TO_FP32(x0_0->d);
const float x0_1d = GGML_FP16_TO_FP32(x0_1->d);
const float x1_0d = GGML_FP16_TO_FP32(x1_0->d);
const float x1_1d = GGML_FP16_TO_FP32(x1_1->d);
const float x0_0m = GGML_FP16_TO_FP32(x0_0->m);
const float x0_1m = GGML_FP16_TO_FP32(x0_1->m);
const float x1_0m = GGML_FP16_TO_FP32(x1_0->m);
const float x1_1m = GGML_FP16_TO_FP32(x1_1->m);
const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
const uint8x16_t v0_1 = vcombine_u8(vld1_u8(x1_0->qs), vld1_u8(x1_1->qs));
// 4-bit -> 8-bit
const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, m4b));
const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
const int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8 (v0_1, m4b));
const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));
// interleave
const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
const int8x16_t v0_1lz = vzip1q_s8(v0_1l, v0_1h);
const int8x16_t v0_1hz = vzip2q_s8(v0_1l, v0_1h);
// load y
const int8x16_t v1_0l = vld1q_s8(y0->qs);
const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
const int8x16_t v1_1l = vld1q_s8(y1->qs);
const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
const int16x8_t sy0_0 = vaddq_s16(vmovl_s8(vget_low_s8(v1_0l)), vmovl_s8(vget_high_s8(v1_0l)));
const int16x8_t sy0_1 = vaddq_s16(vmovl_s8(vget_low_s8(v1_0h)), vmovl_s8(vget_high_s8(v1_0h)));
const int16x8_t sy1_0 = vaddq_s16(vmovl_s8(vget_low_s8(v1_1l)), vmovl_s8(vget_high_s8(v1_1l)));
const int16x8_t sy1_1 = vaddq_s16(vmovl_s8(vget_low_s8(v1_1h)), vmovl_s8(vget_high_s8(v1_1h)));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy0_0), vget_high_s16(sy0_0))), x0_0m*y0->d);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy0_1), vget_high_s16(sy0_1))), x0_1m*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy1_0), vget_high_s16(sy1_0))), x1_0m*y1->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddl_s16(vget_low_s16(sy1_1), vget_high_s16(sy1_1))), x1_1m*y1->d);
#if defined(__ARM_FEATURE_DOTPROD)
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l)), x1_0d*y1->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_1hz, v1_1h)), x1_1d*y1->d);
#else
const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d);
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(ph0), x0_1d*y0->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(pl1), x1_0d*y1->d);
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph1), x1_1d*y1->d);
#endif
}
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
#else
// scalar
for (int i = 0; i < nb; i++) {
const uint8_t * restrict x0 = x[2*i + 0].qs;
const uint8_t * restrict x1 = x[2*i + 1].qs;
const int8_t * restrict y0 = y[i].qs;
const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d);
const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m);
const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m);
int sy_0 = 0;
int sy_1 = 0;
int sxy_0 = 0;
int sxy_1 = 0;
for (int j = 0; j < QK8_0/4; j++) {
const uint8_t v0 = x0[j];
const uint8_t v1 = x1[j];
const int x0_0 = v0 & 0xf;
const int x1_0 = v0 >> 4;
const int x0_1 = v1 & 0xf;
const int x1_1 = v1 >> 4;
const int y0_0 = y0[2*j + 0];
const int y1_0 = y0[2*j + 1];
const int y0_1 = y0[2*(j + QK8_0/4) + 0];
const int y1_1 = y0[2*(j + QK8_0/4) + 1];
sy_0 += y0_0 + y1_0;
sy_1 += y0_1 + y1_1;
sxy_0 += x0_0*y0_0 + x1_0*y1_0;
sxy_1 += x0_1*y0_1 + x1_1*y1_1;
}
sumf += (d0*sxy_0 + m0*sy_0)*y[i].d;
sumf += (d1*sxy_1 + m1*sy_1)*y[i].d;
}
#endif
*s = sumf;
}
// compute GGML_VEC_DOT_UNROLL dot products at once
// xs - x row stride in bytes
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
@ -3006,12 +3293,13 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_0] = QK4_0,
[GGML_TYPE_Q4_1] = QK4_1,
[GGML_TYPE_Q4_2] = QK4_2,
[GGML_TYPE_Q4_3] = QK4_3,
[GGML_TYPE_Q8_0] = QK8_0,
[GGML_TYPE_I8] = 1,
[GGML_TYPE_I16] = 1,
[GGML_TYPE_I32] = 1,
};
static_assert(GGML_TYPE_COUNT == 9, "GGML_BLCK_SIZE is outdated");
static_assert(GGML_TYPE_COUNT == 10, "GGML_BLCK_SIZE is outdated");
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = sizeof(float),
@ -3019,12 +3307,13 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_0] = sizeof(block_q4_0),
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
[GGML_TYPE_Q4_2] = sizeof(block_q4_2),
[GGML_TYPE_Q4_3] = sizeof(block_q4_3),
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
[GGML_TYPE_I8] = sizeof(int8_t),
[GGML_TYPE_I16] = sizeof(int16_t),
[GGML_TYPE_I32] = sizeof(int32_t),
};
static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_SIZE is outdated");
static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_SIZE is outdated");
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
@ -3033,12 +3322,13 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_0] = "q4_0",
[GGML_TYPE_Q4_1] = "q4_1",
[GGML_TYPE_Q4_2] = "q4_2",
[GGML_TYPE_Q4_3] = "q4_3",
[GGML_TYPE_Q8_0] = "q8_0",
[GGML_TYPE_I8] = "i8",
[GGML_TYPE_I16] = "i16",
[GGML_TYPE_I32] = "i32",
};
static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_NAME is outdated");
static_assert(GGML_TYPE_COUNT == 10, "GGML_TYPE_NAME is outdated");
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
[GGML_TYPE_F32] = false,
@ -3046,12 +3336,13 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
[GGML_TYPE_Q4_0] = true,
[GGML_TYPE_Q4_1] = true,
[GGML_TYPE_Q4_2] = true,
[GGML_TYPE_Q4_3] = true,
[GGML_TYPE_Q8_0] = true,
[GGML_TYPE_I8] = false,
[GGML_TYPE_I16] = false,
[GGML_TYPE_I32] = false,
};
static_assert(GGML_TYPE_COUNT == 9, "GGML_IS_QUANTIZED is outdated");
static_assert(GGML_TYPE_COUNT == 10, "GGML_IS_QUANTIZED is outdated");
static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
"NONE",
@ -3313,7 +3604,7 @@ static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct
(t0->ne[3] == t1->ne[3]);
}
static inline bool ggml_is_quantized(enum ggml_type type) {
bool ggml_is_quantized(enum ggml_type type) {
return GGML_IS_QUANTIZED[type];
}
@ -6263,6 +6554,7 @@ static void ggml_compute_forward_add(
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
{
ggml_compute_forward_add_q_f32(params, src0, src1, dst);
} break;
@ -7311,7 +7603,6 @@ static void ggml_compute_forward_mul_mat_f32(
// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
CUDA_CHECK(cudaStreamSynchronize(cudaStream));
#else
// zT = y * xT
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@ -7323,6 +7614,7 @@ static void ggml_compute_forward_mul_mat_f32(
}
}
#if defined(GGML_USE_CUBLAS)
CUDA_CHECK(cudaStreamSynchronize(cudaStream));
CUDA_CHECK(cudaFree(d_X));
CUDA_CHECK(cudaFree(d_Y));
CUDA_CHECK(cudaFree(d_D));
@ -7535,7 +7827,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
CUDA_CHECK(cudaStreamSynchronize(cudaStream));
#else
const float * x = wdata;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
@ -7553,6 +7844,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
}
#if defined(GGML_USE_CUBLAS)
CUDA_CHECK(cudaStreamSynchronize(cudaStream));
CUDA_CHECK(cudaFree(d_X));
CUDA_CHECK(cudaFree(d_Y));
CUDA_CHECK(cudaFree(d_D));
@ -7722,13 +8014,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
return;
}
float * const wdata = params->wdata;
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
#if defined(GGML_USE_CUBLAS)
float *d_X = NULL;
float *d_Y = NULL;
float *d_D = NULL;
float *d_Q = NULL;
const float alpha = 1.0f;
const float beta = 0.0f;
const int x_ne = ne01 * ne10;
@ -7738,10 +8028,44 @@ static void ggml_compute_forward_mul_mat_q_f32(
CUDA_CHECK(cudaMalloc((void **)(&d_X), sizeof(float) * x_ne));
CUDA_CHECK(cudaMalloc((void **)(&d_Y), sizeof(float) * y_ne));
CUDA_CHECK(cudaMalloc((void **)(&d_D), sizeof(float) * d_ne));
CUDA_CHECK(cudaMalloc((void **)(&d_Q), GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type]));
void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
if (type == GGML_TYPE_Q4_0) {
dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
}
else if (type == GGML_TYPE_Q4_1) {
dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
}
else if (type == GGML_TYPE_Q4_2) {
dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
}
else if (type == GGML_TYPE_Q4_3) {
dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
}
else {
GGML_ASSERT(false);
}
#else
float * const wdata = params->wdata;
dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
#endif
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
#if defined(GGML_USE_CUBLAS)
// copy and dequantize on device
CUDA_CHECK(
cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02,
GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, cudaStream));
dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, cudaStream);
CUDA_CHECK(cudaGetLastError());
#else
{
size_t id = 0;
for (int64_t i01 = 0; i01 < ne01; ++i01) {
@ -7749,15 +8073,12 @@ static void ggml_compute_forward_mul_mat_q_f32(
id += ne00;
}
}
const float * x = wdata;
const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
#endif
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
#if defined(GGML_USE_CUBLAS)
// copy data to device
CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, cudaStream));
CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, cudaStream));
// compute
@ -7770,7 +8091,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
// copy data to host
CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
CUDA_CHECK(cudaStreamSynchronize(cudaStream));
#else
// zT = y * xT
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
@ -7783,9 +8103,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
}
#if defined(GGML_USE_CUBLAS)
CUDA_CHECK(cudaStreamSynchronize(cudaStream));
CUDA_CHECK(cudaFree(d_X));
CUDA_CHECK(cudaFree(d_Y));
CUDA_CHECK(cudaFree(d_D));
CUDA_CHECK(cudaFree(d_Q));
#endif
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
@ -7875,6 +8197,7 @@ static void ggml_compute_forward_mul_mat(
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
case GGML_TYPE_Q8_0:
{
ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
@ -7892,34 +8215,6 @@ static void ggml_compute_forward_mul_mat(
GGML_ASSERT(false);
} break;
}
#if 0
if (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_Q4_1) {
static int first = 8;
printf("src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src0->ne[0], src0->ne[1], src0->ne[2]);
printf("src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src1->ne[0], src1->ne[1], src1->ne[2]);
printf("dst: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
if (first) {
--first;
} else {
for (int k = 0; k < dst->ne[1]; ++k) {
for (int j = 0; j < dst->ne[0]/16; ++j) {
for (int i = 0; i < 16; ++i) {
printf("%8.4f ", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);
}
printf("\n");
}
printf("\n");
}
printf("\n");
exit(0);
}
} else {
printf("aaaa src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src0->ne[0], src0->ne[1], src0->ne[2]);
printf("aaaa src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", src1->ne[0], src1->ne[1], src1->ne[2]);
printf("aaaa dst: ne0 = %5d, ne1 = %5d, ne2 = %5d\n", dst->ne[0], dst->ne[1], dst->ne[2]);
}
#endif
}
// ggml_compute_forward_scale
@ -8131,6 +8426,7 @@ static void ggml_compute_forward_get_rows(
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
case GGML_TYPE_Q8_0:
{
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
@ -11870,6 +12166,29 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
return (n/QK4_2*sizeof(block_q4_2));
}
size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) {
assert(k % QK4_3 == 0);
const int nb = k / QK4_3;
for (int j = 0; j < n; j += k) {
block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3;
quantize_row_q4_3_reference(src + j, y, k);
for (int i = 0; i < nb; i++) {
for (int l = 0; l < QK4_3; l += 2) {
const uint8_t vi0 = y[i].qs[l/2] & 0xF;
const uint8_t vi1 = y[i].qs[l/2] >> 4;
hist[vi0]++;
hist[vi1]++;
}
}
}
return (n/QK4_3*sizeof(block_q4_3));
}
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
size_t result = 0;
switch (type) {

6
ggml.h
View file

@ -205,7 +205,8 @@ enum ggml_type {
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
GGML_TYPE_Q4_2 = 4,
GGML_TYPE_Q8_0 = 5,
GGML_TYPE_Q4_3 = 5,
GGML_TYPE_Q8_0 = 6,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
@ -360,6 +361,8 @@ const char * ggml_type_name(enum ggml_type type);
size_t ggml_element_size(const struct ggml_tensor * tensor);
bool ggml_is_quantized(enum ggml_type type);
struct ggml_context * ggml_init(struct ggml_init_params params);
void ggml_free(struct ggml_context * ctx);
@ -808,6 +811,7 @@ enum ggml_opt_result ggml_opt(
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

View file

@ -482,6 +482,7 @@ struct llama_file_loader {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
break;
default: {
throw format("unrecognized tensor type %u\n", shard.type);
@ -555,6 +556,7 @@ struct llama_file_saver {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q4_2:
case GGML_TYPE_Q4_3:
break;
default: LLAMA_ASSERT(false);
}
@ -844,6 +846,7 @@ static const char *llama_ftype_name(enum llama_ftype ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
return "mostly Q4_1, some F16";
case LLAMA_FTYPE_MOSTLY_Q4_2: return "mostly Q4_2";
case LLAMA_FTYPE_MOSTLY_Q4_3: return "mostly Q4_3";
default: return "unknown, may not work";
}
}
@ -1578,6 +1581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
case LLAMA_FTYPE_MOSTLY_Q4_1: quantized_type = GGML_TYPE_Q4_1; break;
case LLAMA_FTYPE_MOSTLY_Q4_2: quantized_type = GGML_TYPE_Q4_2; break;
case LLAMA_FTYPE_MOSTLY_Q4_3: quantized_type = GGML_TYPE_Q4_3; break;
default: throw format("invalid output file type %d\n", ftype);
};
@ -1990,7 +1994,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
base_t = dest_t;
}
if (base_t->type == GGML_TYPE_Q4_0 || base_t->type == GGML_TYPE_Q4_1 || base_t->type == GGML_TYPE_Q4_2) {
if (ggml_is_quantized(base_t->type)) {
if (!warned) {
fprintf(stderr, "%s: warning: using a lora adapter with a quantized model may result in poor quality, "
"use a f16 or f32 base model with --lora-base\n", __func__);

View file

@ -73,6 +73,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
};
LLAMA_API struct llama_context_params llama_context_default_params();