Merge branch 'master' into betterlogs

Commit 5031c50e48 by staviq, 2023-08-26 17:57:10 +02:00 (committed by GitHub)
35 changed files with 3291 additions and 1358 deletions

@ -0,0 +1,44 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the ROCm development container image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set the ROCm GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT ["/app/.devops/tools.sh"]

@ -0,0 +1,44 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6
# Target the ROCm development container image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
FROM ${BASE_ROCM_DEV_CONTAINER} as build
# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
gfx803 \
gfx900 \
gfx906 \
gfx908 \
gfx90a \
gfx1010 \
gfx1030 \
gfx1100 \
gfx1101 \
gfx1102
COPY requirements.txt requirements.txt
RUN pip install --upgrade pip setuptools wheel \
&& pip install -r requirements.txt
WORKDIR /app
COPY . .
# Set the ROCm GPU architecture targets
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
ENTRYPOINT [ "/app/main" ]
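
As a usage sketch for the image defined above (the image tag, Dockerfile path, and model path below are illustrative placeholders; the `--device` flags are the usual ROCm device nodes):

```bash
# Build the ROCm image from the repository root (Dockerfile path and tag are assumptions).
docker build -t local/llama.cpp:rocm -f path/to/this.Dockerfile .

# Run it with the host GPU exposed; /dev/kfd and /dev/dri are the standard ROCm devices.
docker run --rm -it \
  --device=/dev/kfd --device=/dev/dri \
  -v /path/to/models:/models \
  local/llama.cpp:rocm -m /models/ggml-model-q4_0.bin -p "Hello" -n 64
```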

@ -5,14 +5,7 @@
.vscode/ .vscode/
.DS_Store .DS_Store
build/ build*/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
models/* models/*

.gitignore vendored

@ -17,20 +17,7 @@
.vs/ .vs/
.vscode/ .vscode/
build/ build*/
build-em/
build-debug/
build-release/
build-ci-debug/
build-ci-release/
build-static/
build-cublas/
build-opencl/
build-metal/
build-mpi/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
out/ out/
tmp/ tmp/
@ -61,6 +48,7 @@ compile_commands.json
CMakeSettings.json CMakeSettings.json
__pycache__ __pycache__
dist
zig-out/ zig-out/
zig-cache/ zig-cache/
@ -71,7 +59,6 @@ perf-*.txt
examples/jeopardy/results.txt examples/jeopardy/results.txt
pyproject.toml
poetry.lock poetry.lock
poetry.toml poetry.toml

@ -74,6 +74,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels") set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF) option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" OFF) option(LLAMA_METAL "llama: use Metal" OFF)
option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_MPI "llama: use MPI" OFF)
@ -352,6 +353,43 @@ if (LLAMA_CLBLAST)
endif() endif()
endif() endif()
if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
find_package(hip)
find_package(hipblas)
find_package(rocblas)
if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
endif()
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
else()
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()
if (LLAMA_ALL_WARNINGS) if (LLAMA_ALL_WARNINGS)
if (NOT MSVC) if (NOT MSVC)
set(c_flags set(c_flags

@ -280,6 +280,30 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST endif # LLAMA_CLBLAST
ifdef LLAMA_HIPBLAS
ROCM_PATH ?= /opt/rocm
HIPCC ?= $(ROCM_PATH)/bin/hipcc
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
LLAMA_CUDA_DMMV_X ?= 32
LLAMA_CUDA_MMV_Y ?= 1
LLAMA_CUDA_KQUANTS_ITER ?= 2
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
HIPFLAGS += -DCC_TURING=1000000000
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
OBJS += ggml-cuda.o
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS
ifdef LLAMA_METAL ifdef LLAMA_METAL
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
CXXFLAGS += -DGGML_USE_METAL CXXFLAGS += -DGGML_USE_METAL

@ -422,6 +422,35 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### hipBLAS
This provides BLAS acceleration on HIP-supported GPUs such as AMD GPUs.
Make sure to have ROCm installed.
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
Windows support is coming soon...
- Using `make`:
```bash
make LLAMA_HIPBLAS=1
```
- Using `CMake`:
```bash
mkdir build
cd build
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
cmake --build .
```
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported, you can set the environment variable `HSA_OVERRIDE_GFX_VERSION` to the version of a similar, supported GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3 (see the example run after the table below).
The following compilation options are also available to tweak performance (they refer to CUDA rather than HIP because this build reuses the same code as the cuBLAS version above):
| Option | Legal values | Default | Description |
|-------------------------|------------------------|---------|-------------|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
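
To make the device-selection notes above concrete, a hedged example of running an already-built binary (the model path is a placeholder; `-ngl` offloads layers to the GPU):

```bash
# Use only the first GPU, and treat an officially unsupported RDNA2 card as gfx1030 (10.3.0).
HIP_VISIBLE_DEVICES=0 HSA_OVERRIDE_GFX_VERSION=10.3.0 \
  ./main -m models/7B/ggml-model-q4_0.bin -ngl 32 -p "Building a website can be done in 10 simple steps:"
```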
- #### CLBlast - #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU. OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.

@ -391,6 +391,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
ln -sfn ${mnt_models} ${SRC}/models-mnt ln -sfn ${mnt_models} ${SRC}/models-mnt
python3 -m pip install -r ${SRC}/requirements.txt python3 -m pip install -r ${SRC}/requirements.txt
python3 -m pip install --editable gguf-py
fi fi
ret=0 ret=0

@ -635,9 +635,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n"); fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
#ifdef GGML_USE_CUBLAS
fprintf(stdout, " -nommq, --no-mul-mat-q\n"); fprintf(stdout, " -nommq, --no-mul-mat-q\n");
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n"); fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
#endif #endif
fprintf(stdout, " --mtest compute maximum memory usage\n"); fprintf(stdout, " --mtest compute maximum memory usage\n");
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n"); fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");

@ -30,6 +30,7 @@ struct gpt_params {
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_beams = 0; // if non-zero then use beam search of given width.
float rope_freq_base = 10000.0f; // RoPE base frequency float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor float rope_freq_scale = 1.0f; // RoPE frequency scaling factor

@ -105,6 +105,7 @@ class Params:
f_norm_eps: float f_norm_eps: float
f_rope_freq_base: Optional[float] = None f_rope_freq_base: Optional[float] = None
f_rope_scale: Optional[float] = None
ftype: Optional[GGMLFileType] = None ftype: Optional[GGMLFileType] = None
@ -160,13 +161,20 @@ class Params:
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params': def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
config = json.load(open(config_path)) config = json.load(open(config_path))
n_vocab = config["vocab_size"] n_vocab = config["vocab_size"]
n_embd = config["hidden_size"] n_embd = config["hidden_size"]
n_layer = config["num_hidden_layers"] n_layer = config["num_hidden_layers"]
n_ff = config["intermediate_size"] n_ff = config["intermediate_size"]
n_head = config["num_attention_heads"] n_head = config["num_attention_heads"]
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
f_norm_eps = config["rms_norm_eps"] f_norm_eps = config["rms_norm_eps"]
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
rope_scaling = config.get("rope_scaling")
if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
f_rope_scale = config["rope_scaling"].get("factor")
else:
f_rope_scale = None
n_mult = Params.find_n_mult(n_ff, n_embd) n_mult = Params.find_n_mult(n_ff, n_embd)
@ -179,15 +187,17 @@ class Params:
"Suggestion: provide 'config.json' of the model in the same directory containing model files.") "Suggestion: provide 'config.json' of the model in the same directory containing model files.")
return Params( return Params(
n_vocab = n_vocab, n_vocab = n_vocab,
n_embd = n_embd, n_embd = n_embd,
n_mult = n_mult, n_mult = n_mult,
n_layer = n_layer, n_layer = n_layer,
n_ctx = n_ctx, n_ctx = n_ctx,
n_ff = n_ff, n_ff = n_ff,
n_head = n_head, n_head = n_head,
n_head_kv = n_head_kv, n_head_kv = n_head_kv,
f_norm_eps = f_norm_eps, f_norm_eps = f_norm_eps,
f_rope_freq_base = f_rope_freq_base,
f_rope_scale = f_rope_scale,
) )
# LLaMA v2 70B params.json # LLaMA v2 70B params.json
@ -771,6 +781,9 @@ class OutputFile:
if params.f_rope_freq_base: if params.f_rope_freq_base:
self.gguf.add_rope_freq_base(params.f_rope_freq_base) self.gguf.add_rope_freq_base(params.f_rope_freq_base)
if params.f_rope_scale:
self.gguf.add_rope_scale_linear(params.f_rope_scale)
if params.ftype: if params.ftype:
self.gguf.add_file_type(params.ftype) self.gguf.add_file_type(params.ftype)

@ -25,6 +25,7 @@ else()
add_subdirectory(simple) add_subdirectory(simple)
add_subdirectory(embd-input) add_subdirectory(embd-input)
add_subdirectory(llama-bench) add_subdirectory(llama-bench)
add_subdirectory(beam_search)
if (LLAMA_METAL) if (LLAMA_METAL)
add_subdirectory(metal) add_subdirectory(metal)
endif() endif()

@ -0,0 +1,8 @@
set(TARGET beam_search)
add_executable(${TARGET} beam_search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()

@ -0,0 +1,188 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
#include "common.h"
#include "llama.h"
#include "build-info.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif
// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example, called each time the beam lengths increase:
// * Show progress by printing ',' followed by the number of convergent beam tokens, if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%lu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;
//---------------------------------
// Print help :
//---------------------------------
if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
params.model = argv[1];
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
if ( argc > 3 )
{
params.prompt = argv[3];
}
if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_backend_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
std::cout << llama_token_to_str(ctx, id);
}
std::cout << std::flush;
int n_past = llama_get_kv_cache_token_count(ctx);
if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();
beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_str(ctx,token_id);
}
std::cout << std::endl;
llama_free( ctx );
llama_free_model( model );
llama_backend_free();
return 0;
}
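
For reference, a minimal invocation of the new example, matching the usage string printed above (the model path is a placeholder; with no prompt argument the built-in default request is used):

```bash
# beam_search MODEL_PATH [BEAM_WIDTH=2] [PROMPT]
./beam_search models/7B/ggml-model-q4_0.bin 4 "How many countries are there?"
```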

@ -18,9 +18,7 @@
#include "llama.h" #include "llama.h"
#include "common.h" #include "common.h"
#include "build-info.h" #include "build-info.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h" #include "ggml-cuda.h"
#endif
// utils // utils
static uint64_t get_time_ns() { static uint64_t get_time_ns() {
@ -443,6 +441,8 @@ struct test {
static const std::string gpu_info; static const std::string gpu_info;
std::string model_filename; std::string model_filename;
std::string model_type; std::string model_type;
uint64_t model_size;
uint64_t model_n_params;
int n_batch; int n_batch;
int n_threads; int n_threads;
bool f32_kv; bool f32_kv;
@ -459,8 +459,10 @@ struct test {
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) { test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
model_filename = inst.model; model_filename = inst.model;
char buf[128]; char buf[128];
llama_model_type(lmodel, buf, sizeof(buf)); llama_model_desc(lmodel, buf, sizeof(buf));
model_type = buf; model_type = buf;
model_size = llama_model_size(lmodel);
model_n_params = llama_model_n_params(lmodel);
n_batch = inst.n_batch; n_batch = inst.n_batch;
n_threads = inst.n_threads; n_threads = inst.n_threads;
f32_kv = inst.f32_kv; f32_kv = inst.f32_kv;
@ -504,7 +506,7 @@ struct test {
static std::string get_backend() { static std::string get_backend() {
if (cuda) { if (cuda) {
return "CUDA"; return GGML_CUDA_NAME;
} }
if (opencl) { if (opencl) {
return "OpenCL"; return "OpenCL";
@ -526,7 +528,7 @@ struct test {
"build_commit", "build_number", "build_commit", "build_number",
"cuda", "opencl", "metal", "gpu_blas", "blas", "cuda", "opencl", "metal", "gpu_blas", "blas",
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_threads", "f16_kv", "n_batch", "n_threads", "f16_kv",
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split", "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
"n_prompt", "n_gen", "test_time", "n_prompt", "n_gen", "test_time",
@ -540,6 +542,7 @@ struct test {
static field_type get_field_type(const std::string & field) { static field_type get_field_type(const std::string & field) {
if (field == "build_number" || field == "n_batch" || field == "n_threads" || if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
field == "model_size" || field == "model_n_params" ||
field == "n_gpu_layers" || field == "main_gpu" || field == "n_gpu_layers" || field == "main_gpu" ||
field == "n_prompt" || field == "n_gen" || field == "n_prompt" || field == "n_gen" ||
field == "avg_ns" || field == "stddev_ns") { field == "avg_ns" || field == "stddev_ns") {
@ -575,7 +578,7 @@ struct test {
build_commit, std::to_string(build_number), build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv), std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str, std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(n_prompt), std::to_string(n_gen), test_time,
@ -711,8 +714,15 @@ struct markdown_printer : public printer {
return -30; return -30;
} }
if (field == "t/s") { if (field == "t/s") {
return 15; return 16;
} }
if (field == "size" || field == "params") {
return 10;
}
if (field == "n_gpu_layers") {
return 3;
}
int width = std::max((int)field.length(), 10); int width = std::max((int)field.length(), 10);
if (test::get_field_type(field) == test::STRING) { if (test::get_field_type(field) == test::STRING) {
@ -721,9 +731,28 @@ struct markdown_printer : public printer {
return width; return width;
} }
static std::string get_field_display_name(const std::string & field) {
if (field == "n_gpu_layers") {
return "ngl";
}
if (field == "n_threads") {
return "threads";
}
if (field == "mul_mat_q") {
return "mmq";
}
if (field == "tensor_split") {
return "ts";
}
return field;
}
void print_header(const cmd_params & params) override { void print_header(const cmd_params & params) override {
// select fields to print // select fields to print
fields = { "model", "backend" }; fields.push_back("model");
fields.push_back("size");
fields.push_back("params");
fields.push_back("backend");
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS"; bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
if (!is_cpu_backend) { if (!is_cpu_backend) {
fields.push_back("n_gpu_layers"); fields.push_back("n_gpu_layers");
@ -754,7 +783,7 @@ struct markdown_printer : public printer {
fprintf(fout, "|"); fprintf(fout, "|");
for (const auto & field : fields) { for (const auto & field : fields) {
fprintf(fout, " %*s |", get_field_width(field), field.c_str()); fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
} }
fprintf(fout, "\n"); fprintf(fout, "\n");
fprintf(fout, "|"); fprintf(fout, "|");
@ -771,12 +800,26 @@ struct markdown_printer : public printer {
fprintf(fout, "|"); fprintf(fout, "|");
for (const auto & field : fields) { for (const auto & field : fields) {
std::string value; std::string value;
char buf[128];
if (field == "model") { if (field == "model") {
value = t.model_type; value = t.model_type;
} else if (field == "size") {
if (t.model_size < 1024*1024*1024) {
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
} else {
snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
}
value = buf;
} else if (field == "params") {
if (t.model_n_params < 1000*1000*1000) {
snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
} else {
snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
}
value = buf;
} else if (field == "backend") { } else if (field == "backend") {
value = test::get_backend(); value = test::get_backend();
} else if (field == "test") { } else if (field == "test") {
char buf[128];
if (t.n_prompt > 0 && t.n_gen == 0) { if (t.n_prompt > 0 && t.n_gen == 0) {
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt); snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
} else if (t.n_gen > 0 && t.n_prompt == 0) { } else if (t.n_gen > 0 && t.n_prompt == 0) {
@ -787,7 +830,6 @@ struct markdown_printer : public printer {
} }
value = buf; value = buf;
} else if (field == "t/s") { } else if (field == "t/s") {
char buf[128];
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
value = buf; value = buf;
} else if (vmap.find(field) != vmap.end()) { } else if (vmap.find(field) != vmap.end()) {
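
A hedged example run; with the changes above, the markdown output gains "size" and "params" columns and uses the shortened column headers (ngl, threads, mmq, ts):

```bash
# Model path is a placeholder; the defaults run both prompt-processing and generation tests.
./llama-bench -m models/7B/ggml-model-q4_0.bin
```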

@ -201,13 +201,20 @@ int main(int argc, char ** argv) {
} }
} }
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; // Add BOS if SPM tokenizer
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
// tokenize the prompt // tokenize the prompt
LOG("Tokenize the prompt\n") LOG("Tokenize the prompt\n")
std::vector<llama_token> embd_inp; std::vector<llama_token> embd_inp;
if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
// Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' ');
}
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm); embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
} else { } else {
embd_inp = session_tokens; embd_inp = session_tokens;
} }
@ -229,11 +236,13 @@ int main(int argc, char ** argv) {
if (ctx_guidance) { if (ctx_guidance) {
params.cfg_negative_prompt.insert(0, 1, ' '); params.cfg_negative_prompt.insert(0, 1, ' ');
LOG("cfg_negative_prompt: \"%s\"\n", LOG_TOSTR(params.cfg_negative_prompt)) LOG("cfg_negative_prompt: \"%s\"\n", LOG_TOSTR(params.cfg_negative_prompt))
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(guidance_inp,ctx_guidance)) LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(guidance_inp,ctx_guidance))
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm); std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(original_inp,ctx)) LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(original_inp,ctx))
original_prompt_len = original_inp.size(); original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len; guidance_offset = (int)guidance_inp.size() - original_prompt_len;
LOG("original_prompt_len: %s", LOG_TOSTR(original_prompt_len)) LOG("original_prompt_len: %s", LOG_TOSTR(original_prompt_len))
@ -290,7 +299,7 @@ int main(int argc, char ** argv) {
} }
// prefix & suffix for instruct mode // prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm); const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false); const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
// in instruct mode, we inject a prefix and a suffix to each input by the user // in instruct mode, we inject a prefix and a suffix to each input by the user

@ -6,6 +6,8 @@
#include <ctime> #include <ctime>
#include <sstream> #include <sstream>
#include <cstring> #include <cstring>
#include <thread>
#include <mutex>
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
@ -27,6 +29,40 @@ std::vector<float> softmax(const std::vector<float>& logits) {
return probs; return probs;
} }
float log_softmax(int n_vocab, const float * logits, int tok) {
float max_logit = logits[0];
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
double sum_exp = 0.0;
for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
return logits[tok] - max_logit - log(sum_exp);
}
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
double& nll, double& nll2) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
double local_nll = 0, local_nll2 = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
int i = counter++;
if (i >= n_token) {
nll += local_nll; nll2 += local_nll2;
break;
}
lock.unlock();
double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
local_nll += v;
local_nll2 += v*v;
}
};
for (auto& w : workers) w = std::thread(compute);
compute();
for (auto& w : workers) w.join();
}
void perplexity_v2(llama_context * ctx, const gpt_params & params) { void perplexity_v2(llama_context * ctx, const gpt_params & params) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
@ -166,9 +202,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
int count = 0; int count = 0;
double nll = 0.0; double nll = 0.0;
double nll2 = 0.0;
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch); fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
for (int i = 0; i < n_chunk; ++i) { for (int i = 0; i < n_chunk; ++i) {
const int start = i * params.n_ctx; const int start = i * params.n_ctx;
const int end = start + params.n_ctx; const int end = start + params.n_ctx;
@ -228,26 +267,32 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
// Example, we have a context window of 512, we will compute perplexity for each of the // Example, we have a context window of 512, we will compute perplexity for each of the
// last 256 tokens. Then, we split the input up into context window size chunks to // last 256 tokens. Then, we split the input up into context window size chunks to
// process the entire prompt. // process the entire prompt.
for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) { const int first = std::min(512, params.n_ctx/2);
// Calculate probability of next token, given the previous ones. process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
const std::vector<float> tok_logits( count += params.n_ctx - first - 1;
logits.begin() + (j + 0) * n_vocab,
logits.begin() + (j + 1) * n_vocab);
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
nll += -std::log(prob);
++count;
}
// perplexity is e^(average negative log-likelihood) // perplexity is e^(average negative log-likelihood)
if (params.ppl_output_type == 0) { if (params.ppl_output_type == 0) {
printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
} else { } else {
printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count)); double av = nll/count;
double av2 = nll2/count - av*av;
if (av2 > 0) av2 = sqrt(av2/(count-1));
printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
} }
fflush(stdout); fflush(stdout);
} }
printf("\n"); printf("\n");
nll2 /= count;
nll /= count;
nll2 -= nll * nll;
if (nll2 > 0) {
nll2 = sqrt(nll2/(count-1));
double ppl = exp(nll);
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
} else {
printf("Unexpected negative standard deviation of log(prob)\n");
}
} }
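
For clarity, a sketch of the statistics the code above accumulates: with per-token negative log-likelihoods v = -log p over N scored tokens, it reports the perplexity together with a first-order (delta-method) propagation of the standard error of the mean:

```latex
\mu      = \frac{1}{N}\sum_{i=1}^{N} v_i, \qquad
\sigma^2 = \frac{1}{N}\sum_{i=1}^{N} v_i^2 - \mu^2, \qquad
\mathrm{SE}(\mu) \approx \sqrt{\frac{\sigma^2}{N-1}}, \qquad
\mathrm{PPL} = e^{\mu}, \qquad
\Delta\mathrm{PPL} \approx e^{\mu}\,\mathrm{SE}(\mu)
```

This matches the `nll`/`nll2` bookkeeping above and the final `PPL = ... +/- ...` printout.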
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
@ -306,6 +351,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count); fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
// This is needed as usual for LLaMA models // This is needed as usual for LLaMA models
const bool add_bos = is_spm; const bool add_bos = is_spm;
@ -361,6 +407,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
double acc = 0.0f; double acc = 0.0f;
const int n_vocab = llama_n_vocab(ctx); const int n_vocab = llama_n_vocab(ctx);
std::vector<std::vector<int>> ending_tokens(4);
std::vector<float> tok_logits(n_vocab); std::vector<float> tok_logits(n_vocab);
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) { for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
@ -368,11 +416,21 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos); std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
size_t context_size = context_embd.size(); size_t context_size = context_embd.size();
for (int i = 0; i < 4; ++i) {
ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos);
for (int k = 0; k < int(context_size); ++k) {
if (ending_tokens[i][k] != context_embd[k]) {
fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
break;
}
}
}
// Do the 1st ending // Do the 1st ending
// In this case we include the context when evaluating // In this case we include the context when evaluating
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos); //auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
auto query_embd = ending_tokens[0];
auto query_size = query_embd.size(); auto query_size = query_embd.size();
//printf("First query: %d\n",(int)query_size);
// Stop if query wont fit the ctx window // Stop if query wont fit the ctx window
if (query_size > (size_t)params.n_ctx) { if (query_size > (size_t)params.n_ctx) {
@ -417,7 +475,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) { for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
// Tokenize the query // Tokenize the query
query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false); query_embd.resize(ending_tokens[ending_idx].size() - context_size);
std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
query_size = query_embd.size(); query_size = query_embd.size();
// Stop if query wont fit the ctx window // Stop if query wont fit the ctx window

@ -77,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed.
```bash ```bash
mkdir llama-client mkdir llama-client
cd llama-client cd llama-client
npm init
npm install axios
``` ```
Create a index.js file and put inside this: Create a index.js file and put inside this:
```javascript ```javascript
const axios = require("axios");
const prompt = `Building a website can be done in 10 simple steps:`; const prompt = `Building a website can be done in 10 simple steps:`;
async function Test() { async function Test() {
let result = await axios.post("http://127.0.0.1:8080/completion", { let response = await fetch("http://127.0.0.1:8080/completion", {
prompt, method: 'POST',
n_predict: 512, body: JSON.stringify({
}); prompt,
n_predict: 512,
// the response is received until completion finish })
console.log(result.data.content); })
console.log((await response.json()).content)
} }
Test(); Test()
``` ```
And run it: And run it:
```bash ```bash
node . node index.js
``` ```
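
The same request can also be issued without Node.js; a curl equivalent of the snippet above (the endpoint and fields are exactly those used by the JavaScript example):

```bash
curl --request POST http://127.0.0.1:8080/completion \
     --header "Content-Type: application/json" \
     --data '{"prompt": "Building a website can be done in 10 simple steps:", "n_predict": 512}'
```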
## API Endpoints ## API Endpoints

File diff suppressed because it is too large.

@ -102,6 +102,17 @@
padding: 0.5em; padding: 0.5em;
} }
.prob-set {
padding: 0.3em;
border-bottom: 1px solid #ccc;
}
.popover-content {
position: absolute;
background-color: white;
padding: 0.2em;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
textarea { textarea {
padding: 5px; padding: 5px;
@ -133,11 +144,17 @@
font-size: 80%; font-size: 80%;
color: #888; color: #888;
} }
@media (prefers-color-scheme: dark) {
.popover-content {
background-color: black;
}
}
</style> </style>
<script type="module"> <script type="module">
import { import {
html, h, signal, effect, computed, render, useSignal, useEffect, useRef html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
} from '/index.js'; } from '/index.js';
import { llama } from '/completion.js'; import { llama } from '/completion.js';
@ -168,6 +185,7 @@
mirostat_tau: 5, // target entropy mirostat_tau: 5, // target entropy
mirostat_eta: 0.1, // learning rate mirostat_eta: 0.1, // learning rate
grammar: '', grammar: '',
n_probs: 0, // no completion_probabilities
}) })
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */ /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
@ -334,10 +352,21 @@
const prompt = template(session.value.template, { const prompt = template(session.value.template, {
message: msg, message: msg,
history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"), history: session.value.transcript.flatMap(
([name, data]) =>
template(
session.value.historyTemplate,
{
name,
message: Array.isArray(data) ?
data.map(msg => msg.content).join('').replace(/^\s/, '') :
data,
}
)
).join("\n"),
}); });
let currentMessage = ''; const currentMessages = [];
const history = session.value.transcript const history = session.value.transcript
const llamaParams = { const llamaParams = {
@ -347,15 +376,19 @@
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) { for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
const data = chunk.data; const data = chunk.data;
currentMessage += data.content;
// remove leading whitespace
currentMessage = currentMessage.replace(/^\s+/, "")
transcriptUpdate([...history, ["{{char}}", currentMessage]])
if (data.stop) { if (data.stop) {
console.log("Completion finished: '", currentMessage, "', summary: ", data); while (
currentMessages.length > 0 &&
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
) {
currentMessages.pop();
}
transcriptUpdate([...history, ["{{char}}", currentMessages]])
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
} else {
currentMessages.push(data);
transcriptUpdate([...history, ["{{char}}", currentMessages]])
} }
if (data.timings) { if (data.timings) {
@ -420,8 +453,18 @@
} }
}, [messages]) }, [messages])
const chatLine = ([user, msg]) => { const chatLine = ([user, data], index) => {
return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdownish} text=${template(msg)} /></p>` let message
const isArrayMessage = Array.isArray(data)
if (params.value.n_probs > 0 && isArrayMessage) {
message = html`<${Probabilities} data=${data} />`
} else {
const text = isArrayMessage ?
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
data;
message = html`<${Markdownish} text=${template(text)} />`
}
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
}; };
return html` return html`
@ -568,10 +611,71 @@
${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})} ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})} ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
</fieldset> </fieldset>
<fieldset>
${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
</fieldset>
</details> </details>
</form> </form>
` `
} }
const probColor = (p) => {
const r = Math.floor(192 * (1 - p));
const g = Math.floor(192 * p);
return `rgba(${r},${g},0,0.3)`;
}
const Probabilities = (params) => {
return params.data.map(msg => {
const { completion_probabilities } = msg;
if (
!completion_probabilities ||
completion_probabilities.length === 0
) return msg.content
if (completion_probabilities.length > 1) {
// Not for byte pair
if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
const splitData = completion_probabilities.map(prob => ({
content: prob.content,
completion_probabilities: [prob]
}))
return html`<${Probabilities} data=${splitData} />`
}
const { probs, content } = completion_probabilities[0]
const found = probs.find(p => p.tok_str === msg.content)
const pColor = found ? probColor(found.prob) : 'transparent'
const popoverChildren = html`
<div class="prob-set">
${probs.map((p, index) => {
return html`
<div
key=${index}
title=${`prob: ${p.prob}`}
style=${{
padding: '0.3em',
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
}}
>
<span>${p.tok_str}: </span>
<span>${Math.floor(p.prob * 100)}%</span>
</div>
`
})}
</div>
`
return html`
<${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
</>
`
});
}
// poor mans markdown replacement // poor mans markdown replacement
const Markdownish = (params) => { const Markdownish = (params) => {
const md = params.text const md = params.text
@ -600,10 +704,121 @@
` `
} }
// simple popover impl
const Popover = (props) => {
const isOpen = useSignal(false);
const position = useSignal({ top: '0px', left: '0px' });
const buttonRef = useRef(null);
const popoverRef = useRef(null);
const togglePopover = () => {
if (buttonRef.current) {
const rect = buttonRef.current.getBoundingClientRect();
position.value = {
top: `${rect.bottom + window.scrollY}px`,
left: `${rect.left + window.scrollX}px`,
};
}
isOpen.value = !isOpen.value;
};
const handleClickOutside = (event) => {
if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
isOpen.value = false;
}
};
useEffect(() => {
document.addEventListener('mousedown', handleClickOutside);
return () => {
document.removeEventListener('mousedown', handleClickOutside);
};
}, []);
return html`
<span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
${isOpen.value && html`
<${Portal} into="#portal">
<div
ref=${popoverRef}
class="popover-content"
style=${{
top: position.value.top,
left: position.value.left,
}}
>
${props.popoverChildren}
</div>
</${Portal}>
`}
`;
};
// Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
/** Redirect rendering of descendants into the given CSS selector */
class Portal extends Component {
componentDidUpdate(props) {
for (let i in props) {
if (props[i] !== this.props[i]) {
return setTimeout(this.renderLayer);
}
}
}
componentDidMount() {
this.isMounted = true;
this.renderLayer = this.renderLayer.bind(this);
this.renderLayer();
}
componentWillUnmount() {
this.renderLayer(false);
this.isMounted = false;
if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
}
findNode(node) {
return typeof node === 'string' ? document.querySelector(node) : node;
}
renderLayer(show = true) {
if (!this.isMounted) return;
// clean up old node if moving bases:
if (this.props.into !== this.intoPointer) {
this.intoPointer = this.props.into;
if (this.into && this.remote) {
this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
}
this.into = this.findNode(this.props.into);
}
this.remote = render(html`
<${PortalProxy} context=${this.context}>
${show && this.props.children || null}
</${PortalProxy}>
`, this.into, this.remote);
}
render() {
return null;
}
}
// high-order component that renders its first child if it exists.
// used as a conditional rendering proxy.
class PortalProxy extends Component {
getChildContext() {
return this.props.context;
}
render({ children }) {
return children || null;
}
}
function App(props) { function App(props) {
return html` return html`
<div id="container"> <div>
<header> <header>
<h1>llama.cpp</h1> <h1>llama.cpp</h1>
</header> </header>
@ -624,11 +839,13 @@
`; `;
} }
render(h(App), document.body); render(h(App), document.querySelector('#container'));
</script> </script>
</head> </head>
<body> <body>
<div id="container"></div>
<div id="portal"></div>
</body> </body>
</html> </html>

@ -124,8 +124,9 @@ static void server_log(const char *level, const char *function, int line,
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
{ {
std::string out = token == -1 ? "" : llama_token_to_str(ctx, token); std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
// if first bit is 1, meaning it's a partial character // if the size is 1 and first bit is 1, meaning it's a partial character
if (out.size() > 0 && (out[0] & 0x80) == 0x80) // (size > 1 meaning it's already a known token)
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
{ {
std::stringstream ss; std::stringstream ss;
ss << std::hex << (out[0] & 0xff); ss << std::hex << (out[0] & 0xff);
@ -1208,6 +1209,62 @@ static void log_server_request(const Request &req, const Response &res)
}); });
} }
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example, called each time the beam lengths increase:
// * Show progress by printing ',' followed by the number of convergent beam tokens, if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
auto & llama = *static_cast<llama_server_context*>(callback_data);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
printf("%lu", n);
}
fflush(stdout);
#if 0 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams:\n";
for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
struct token_translator {
llama_context * ctx;
std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); }
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
};
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
auto & gtps = llama.generated_token_probs;
auto translator = token_translator{llama.ctx};
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
if (llama.generated_text.capacity() < llama.generated_text.size() + len) {
llama.generated_text.reserve(llama.generated_text.size() + len);
}
for (const completion_token_output & cto : gtps) {
llama.generated_text += translator(cto);
}
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
// own arguments required by this example // own arguments required by this example
@ -1290,22 +1347,30 @@ int main(int argc, char **argv)
llama.beginCompletion(); llama.beginCompletion();
if (!llama.stream) { if (!llama.stream) {
size_t stop_pos = std::string::npos; if (llama.params.n_beams) {
// Fill llama.generated_token_probs vector with final beam.
llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
llama.n_past, llama.n_remain, llama.params.n_threads);
// Translate llama.generated_token_probs to llama.generated_text.
append_to_generated_text_from_generated_token_probs(llama);
} else {
size_t stop_pos = std::string::npos;
while (llama.has_next_token) { while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion(); const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok); const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
stop_pos = llama.findStoppingStrings(llama.generated_text, stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL); token_text.size(), STOP_FULL);
} }
if (stop_pos == std::string::npos) { if (stop_pos == std::string::npos) {
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL); stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
} }
if (stop_pos != std::string::npos) { if (stop_pos != std::string::npos) {
llama.generated_text.erase(llama.generated_text.begin() + stop_pos, llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
llama.generated_text.end()); llama.generated_text.end());
}
} }
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs); const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
@ -1321,59 +1386,86 @@ int main(int argc, char **argv)
while (llama.has_next_token) { while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion(); const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok); if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
if (llama.multibyte_pending > 0) {
continue; continue;
} }
const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok);
size_t pos = std::min(sent_count, llama.generated_text.size()); size_t pos = std::min(sent_count, llama.generated_text.size());
const std::string str_test = llama.generated_text.substr(pos); const std::string str_test = llama.generated_text.substr(pos);
bool is_stop_full = false;
size_t stop_pos = size_t stop_pos =
llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL); llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
if (stop_pos != std::string::npos) { if (stop_pos != std::string::npos) {
is_stop_full = true;
llama.generated_text.erase( llama.generated_text.erase(
llama.generated_text.begin() + pos + stop_pos, llama.generated_text.begin() + pos + stop_pos,
llama.generated_text.end()); llama.generated_text.end());
pos = std::min(sent_count, llama.generated_text.size()); pos = std::min(sent_count, llama.generated_text.size());
} else { } else {
is_stop_full = false;
stop_pos = llama.findStoppingStrings(str_test, token_text.size(), stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
STOP_PARTIAL); STOP_PARTIAL);
} }
const std::string to_send = llama.generated_text.substr(pos, stop_pos); if (
sent_count += to_send.size(); stop_pos == std::string::npos ||
// Send rest of the text if we are at the end of the generation
(!llama.has_next_token && !is_stop_full && stop_pos > 0)
) {
const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
std::vector<completion_token_output> probs_output = {}; sent_count += to_send.size();
if (llama.params.n_probs > 0) { std::vector<completion_token_output> probs_output = {};
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); if (llama.params.n_probs > 0) {
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
if (probs_pos < probs_stop_pos) { size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
if (probs_pos < probs_stop_pos) {
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
}
sent_token_probs_index = probs_stop_pos;
}
const json data = format_partial_response(llama, to_send, probs_output);
const std::string str =
"data: " +
data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n";
LOG_VERBOSE("data stream", {
{ "to_send", str }
});
if (!sink.write(str.data(), str.size())) {
LOG_VERBOSE("stream closed", {});
llama_print_timings(llama.ctx);
return false;
} }
sent_token_probs_index = probs_stop_pos;
} }
const json data = llama.has_next_token if (!llama.has_next_token) {
? format_partial_response(llama, to_send, probs_output) // Generation is done, send extra information.
// Generation is done, send extra information. const json data = format_final_response(llama, "", llama.generated_token_probs);
: format_final_response(llama, to_send, llama.generated_token_probs);
const std::string str = const std::string str =
"data: " + "data: " +
data.dump(-1, ' ', false, json::error_handler_t::replace) + data.dump(-1, ' ', false, json::error_handler_t::replace) +
"\n\n"; "\n\n";
LOG_VERBOSE("data stream", { LOG_VERBOSE("data stream", {
{ "to_send", str } { "to_send", str }
}); });
if (!sink.write(str.data(), str.size())) { if (!sink.write(str.data(), str.size())) {
LOG_VERBOSE("stream closed", {}); LOG_VERBOSE("stream closed", {});
llama_print_timings(llama.ctx); llama_print_timings(llama.ctx);
return false; return false;
}
} }
} }


@ -21,6 +21,12 @@
CoreGraphics CoreGraphics
CoreVideo CoreVideo
] ]
else if isDarwin then
with pkgs.darwin.apple_sdk.frameworks; [
Accelerate
CoreGraphics
CoreVideo
]
else else
with pkgs; [ openblas ] with pkgs; [ openblas ]
); );
@ -80,8 +86,13 @@
type = "app"; type = "app";
program = "${self.packages.${system}.default}/bin/llama"; program = "${self.packages.${system}.default}/bin/llama";
}; };
apps.quantize = {
type = "app";
program = "${self.packages.${system}.default}/bin/quantize";
};
apps.default = self.apps.${system}.llama; apps.default = self.apps.${system}.llama;
devShells.default = pkgs.mkShell { devShells.default = pkgs.mkShell {
buildInputs = [ llama-python ];
packages = nativeBuildInputs ++ osSpecific; packages = nativeBuildInputs ++ osSpecific;
}; };
}); });


@ -8,6 +8,7 @@
#define UNUSED(x) (void)(x) #define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
//#define GGML_ALLOCATOR_DEBUG //#define GGML_ALLOCATOR_DEBUG
@ -67,7 +68,7 @@ struct ggml_allocr {
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE]; struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
size_t max_size; size_t max_size;
bool measure; bool measure;
int parse_seq[GGML_MAX_NODES]; int parse_seq[GGML_MAX_CONCUR];
int parse_seq_len; int parse_seq_len;
#ifdef GGML_ALLOCATOR_DEBUG #ifdef GGML_ALLOCATOR_DEBUG


@ -6,15 +6,116 @@
#include <atomic> #include <atomic>
#include <assert.h> #include <assert.h>
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasCreate hipblasCreate
#define cublasGemmEx hipblasGemmEx
#define cublasHandle_t hipblasHandle_t
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#else
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#endif
#include "ggml-cuda.h" #include "ggml-cuda.h"
#include "ggml.h" #include "ggml.h"
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#ifndef CC_TURING
#define CC_TURING 700 #define CC_TURING 700
#endif
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
return reinterpret_cast<const int&>(c);
}
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
c = __builtin_amdgcn_sdot4(a, b, c, false);
#elif defined(__gfx1100__)
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
#elif defined(__gfx1010__) || defined(__gfx900__)
int tmp1;
int tmp2;
asm("\n \
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
v_add3_u32 %0, %1, %2, %0 \n \
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
v_add3_u32 %0, %1, %2, %0 \n \
"
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
: "v"(a), "v"(b)
);
#else
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
#endif
return c;
}
#endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
@ -424,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
const block_q4_1 * x = (const block_q4_1 *) vx; const block_q4_1 * x = (const block_q4_1 *) vx;
const dfloat d = x[ib].dm.x; const dfloat d = __low2half(x[ib].dm);
const dfloat m = x[ib].dm.y; const dfloat m = __high2half(x[ib].dm);
const int vui = x[ib].qs[iqs]; const int vui = x[ib].qs[iqs];
@ -467,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
const block_q5_1 * x = (const block_q5_1 *) vx; const block_q5_1 * x = (const block_q5_1 *) vx;
const dfloat d = x[ib].dm.x; const dfloat d = __low2half(x[ib].dm);
const dfloat m = x[ib].dm.y; const dfloat m = __high2half(x[ib].dm);
uint32_t qh; uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh)); memcpy(&qh, x[ib].qh, sizeof(qh));
@ -520,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
const uint8_t q = x[i].qs[32*n + l]; const uint8_t q = x[i].qs[32*n + l];
float * y = yy + i*QK_K + 128*n; float * y = yy + i*QK_K + 128*n;
float dall = x[i].dm.x; float dall = __low2half(x[i].dm);
float dmin = x[i].dm.y; float dmin = __high2half(x[i].dm);
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@ -531,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
const int il = tid%16; // 0...15 const int il = tid%16; // 0...15
const uint8_t q = x[i].qs[il] >> (2*is); const uint8_t q = x[i].qs[il] >> (2*is);
float * y = yy + i*QK_K + 16*is + il; float * y = yy + i*QK_K + 16*is + il;
float dall = x[i].dm.x; float dall = __low2half(x[i].dm);
float dmin = x[i].dm.y; float dmin = __high2half(x[i].dm);
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
#endif #endif
@ -618,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
float * y = yy + i*QK_K + 64*il + n*ir; float * y = yy + i*QK_K + 64*il + n*ir;
const float dall = x[i].dm.x; const float dall = __low2half(x[i].dm);
const float dmin = x[i].dm.y; const float dmin = __high2half(x[i].dm);
const uint8_t * q = x[i].qs + 32*il + n*ir; const uint8_t * q = x[i].qs + 32*il + n*ir;
@ -657,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
float * y = yy + i*QK_K + 64*il + 2*ir; float * y = yy + i*QK_K + 64*il + 2*ir;
const float dall = x[i].dm.x; const float dall = __low2half(x[i].dm);
const float dmin = x[i].dm.y; const float dmin = __high2half(x[i].dm);
const uint8_t * ql = x[i].qs + 32*il + 2*ir; const uint8_t * ql = x[i].qs + 32*il + 2*ir;
const uint8_t * qh = x[i].qh + 2*ir; const uint8_t * qh = x[i].qh + 2*ir;
@ -770,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
const float * y = yy + i * QK_K + y_offset; const float * y = yy + i * QK_K + y_offset;
const uint8_t * q = x[i].qs + q_offset; const uint8_t * q = x[i].qs + q_offset;
const float dall = x[i].dm.x; const float dall = __low2half(x[i].dm);
const float dmin = x[i].dm.y; const float dmin = __high2half(x[i].dm);
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
aux[0] = a[0] & 0x0f0f0f0f; aux[0] = a[0] & 0x0f0f0f0f;
@ -991,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
const float * y1 = yy + i*QK_K + y_offset; const float * y1 = yy + i*QK_K + y_offset;
const float * y2 = y1 + 128; const float * y2 = y1 + 128;
const float dall = x[i].dm.x; const float dall = __low2half(x[i].dm);
const float dmin = x[i].dm.y; const float dmin = __high2half(x[i].dm);
const uint16_t * a = (const uint16_t *)x[i].scales; const uint16_t * a = (const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1; aux[0] = a[im+0] & kmask1;
@ -1124,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
const float * y1 = yy + i*QK_K + y_offset; const float * y1 = yy + i*QK_K + y_offset;
const float * y2 = y1 + 128; const float * y2 = y1 + 128;
const float dall = x[i].dm.x; const float dall = __low2half(x[i].dm);
const float dmin = x[i].dm.y; const float dmin = __high2half(x[i].dm);
const uint16_t * a = (const uint16_t *)x[i].scales; const uint16_t * a = (const uint16_t *)x[i].scales;
aux[0] = a[im+0] & kmask1; aux[0] = a[im+0] & kmask1;
@ -1348,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
return; return;
} }
y[ib].ds.x = d; reinterpret_cast<half&>(y[ib].ds.x) = d;
y[ib].ds.y = sum; reinterpret_cast<half&>(y[ib].ds.y) = sum;
} }
template <int qk, int qr, dequantize_kernel_t dequantize_kernel> template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@ -2346,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
} }
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x); return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
} }
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@ -2432,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
#pragma unroll #pragma unroll
for (int i = 0; i < QR2_K; ++ i) { for (int i = 0; i < QR2_K; ++ i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
d8[i] = bq8_1[bq8_offset + i].ds.x; d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
} }
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
@ -2551,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
#pragma unroll #pragma unroll
for (int i = 0; i < QR3_K; ++i) { for (int i = 0; i < QR3_K; ++i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
d8[i] = bq8_1[bq8_offset + i].ds.x; d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
} }
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
@ -2720,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
for (int i = 0; i < QR4_K; ++i) { for (int i = 0; i < QR4_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
d8[i] = bq8i->ds.x; d8[i] = __low2half(bq8i->ds);
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
u[2*i+0] = q8[0]; u[2*i+0] = q8[0];
@ -2747,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
const float dall = bq4_K->d[0]; const float dall = bq4_K->d[0];
const float dmin = bq4_K->d[1]; const float dmin = bq4_K->d[1];
const float d8_1 = bq8_1[0].ds.x; const float d8_1 = __low2float(bq8_1[0].ds);
const float d8_2 = bq8_1[1].ds.x; const float d8_2 = __low2float(bq8_1[1].ds);
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@ -2901,7 +3002,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
#pragma unroll #pragma unroll
for (int i = 0; i < QR5_K; ++i) { for (int i = 0; i < QR5_K; ++i) {
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
d8[i] = bq8i->ds.x; d8[i] = __low2float(bq8i->ds);
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
u[2*i+0] = q8[0]; u[2*i+0] = q8[0];
@ -2919,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
const float d = bq5_K->d; const float d = bq5_K->d;
const float d8_1 = bq8_1[0].ds.x; const float d8_1 = __low2half(bq8_1[0].ds);
const float d8_2 = bq8_1[1].ds.x; const float d8_2 = __low2half(bq8_1[1].ds);
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
@ -3075,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
#pragma unroll #pragma unroll
for (int i = 0; i < QR6_K; ++i) { for (int i = 0; i < QR6_K; ++i) {
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
d8[i] = bq8_1[bq8_offset + 2*i].ds.x; d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
} }
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
@ -3243,7 +3344,7 @@ static __device__ __forceinline__ void mul_mat_q(
*dsi_dst = *dsi_src; *dsi_dst = *dsi_src;
} else { } else {
float * dfi_dst = (float *) dsi_dst; float * dfi_dst = (float *) dsi_dst;
*dfi_dst = (*dsi_src).x; *dfi_dst = __low2half(*dsi_src);
} }
} }
@ -3907,28 +4008,27 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
dst[i + 1] = x0*sin_theta + x1*cos_theta; dst[i + 1] = x0*sin_theta + x1*cos_theta;
} }
// TODO: this implementation is wrong! static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0, const float p_delta, const int p_delta_rows, const float theta_scale) {
// const float p_delta, const int p_delta_rows, const float theta_scale) { const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
// if (col >= ncols) {
// if (col >= ncols) { return;
// return; }
// }
// const int row = blockDim.x*blockIdx.x + threadIdx.x;
// const int row = blockDim.x*blockIdx.x + threadIdx.x; const int i = row*ncols + col/2;
// const int i = row*ncols + col/2;
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2); const float sin_theta = sinf(theta);
// const float sin_theta = sinf(theta); const float cos_theta = cosf(theta);
// const float cos_theta = cosf(theta);
// const float x0 = x[i + 0];
// const float x0 = x[i + 0]; const float x1 = x[i + ncols/2];
// const float x1 = x[i + ncols/2];
// dst[i + 0] = x0*cos_theta - x1*sin_theta;
// dst[i + 0] = x0*cos_theta - x1*sin_theta; dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta; }
//}
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) { static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
const int col = blockDim.x*blockIdx.x + threadIdx.x; const int col = blockDim.x*blockIdx.x + threadIdx.x;
@ -4799,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0, static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
GGML_ASSERT(nrows % 2 == 0); GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1); const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nrows, num_blocks_x, 1); const dim3 block_nums(nrows, num_blocks_x, 1);
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale); rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
} }
static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
const dim3 block_nums(nrows, num_blocks_x, 1);
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
}
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) { static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
GGML_ASSERT(nrows % 4 == 0); GGML_ASSERT(nrows % 4 == 0);
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1); const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
@ -4937,10 +5045,18 @@ void ggml_init_cublas() {
static bool initialized = false; static bool initialized = false;
if (!initialized) { if (!initialized) {
#ifdef __HIP_PLATFORM_AMD__
// Workaround for a rocBLAS bug when using multiple graphics cards:
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
rocblas_initialize();
CUDA_CHECK(cudaDeviceSynchronize());
#endif
CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0; int64_t total_vram = 0;
fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count); fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
for (int id = 0; id < g_device_count; ++id) { for (int id = 0; id < g_device_count; ++id) {
cudaDeviceProp prop; cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
@ -5548,8 +5664,9 @@ inline void ggml_cuda_op_rope(
const float block_p = max(p - (n_ctx - 2.f), 0.f); const float block_p = max(p - (n_ctx - 2.f), 0.f);
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main); rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
} else if (is_neox) { } else if (is_neox) {
GGML_ASSERT(false && "RoPE NeoX not implemented yet"); GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
#pragma message("TODO: implement RoPE NeoX for CUDA") const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
} else { } else {
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale; const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main); rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);


@ -2,6 +2,14 @@
#include "ggml.h" #include "ggml.h"
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif

gguf-py/LICENSE Normal file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Georgi Gerganov
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

gguf-py/README.md Normal file

@ -0,0 +1,55 @@
## gguf
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
(GGML Universal File) format.
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
as an example of its usage.
## Installation
```sh
pip install gguf
```
## Development
Maintainers who participate in development of this package are advised to install it in editable mode:
```sh
cd /path/to/llama.cpp/gguf-py
pip install --editable .
```
**Note**: This may require upgrading your Pip installation, with a message saying that editable installation currently requires `setup.py`.
In this case, upgrade Pip to the latest version:
```sh
pip install --upgrade pip
```
## Publishing
To publish the package, you need to have `twine` and `build` installed:
```sh
pip install build twine
```
Then, follow these steps to release a new version:
1. Update the version in `pyproject.toml`.
2. Build the package:
```sh
python -m build
```
3. Upload the generated distribution archives:
```sh
python -m twine upload dist/*
```
## TODO
- [ ] Add tests
- [ ] Include conversion scripts as command line entry points in this package.
- [ ] Add CI workflow for releasing the package.

gguf-py/gguf/__init__.py Normal file

@ -0,0 +1 @@
from .gguf import *

gguf.py → gguf-py/gguf/gguf.py Executable file → Normal file

gguf-py/pyproject.toml Normal file

@ -0,0 +1,28 @@
[tool.poetry]
name = "gguf"
version = "0.2.1"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
{include = "gguf"},
]
readme = "README.md"
homepage = "https://ggml.ai"
repository = "https://github.com/ggerganov/llama.cpp"
keywords = ["ggml", "gguf", "llama.cpp"]
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
[tool.poetry.dependencies]
python = ">=3.8"
numpy = ">=1.17"
[tool.poetry.dev-dependencies]
pytest = "^5.2"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"


@ -0,0 +1,7 @@
import gguf
# TODO: add tests
def test_write_gguf():
pass

llama.cpp

@ -1635,7 +1635,7 @@ static void llm_load_hparams(
} }
// TODO: This should probably be in llama.h // TODO: This should probably be in llama.h
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape); static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos);
static void llm_load_vocab( static void llm_load_vocab(
llama_model_loader & ml, llama_model_loader & ml,
@ -1737,7 +1737,7 @@ static void llm_load_vocab(
} }
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n' // determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0]; vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
// special tokens // special tokens
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID)); GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
@ -1836,7 +1836,7 @@ static void llm_load_tensors(
(void) main_gpu; (void) main_gpu;
(void) mul_mat_q; (void) mul_mat_q;
#if defined(GGML_USE_CUBLAS) #if defined(GGML_USE_CUBLAS)
LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__); LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
ggml_cuda_set_main_device(main_gpu); ggml_cuda_set_main_device(main_gpu);
ggml_cuda_set_mul_mat_q(mul_mat_q); ggml_cuda_set_mul_mat_q(mul_mat_q);
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
@ -1958,6 +1958,14 @@ static void llm_load_tensors(
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
vram_weights += ggml_nbytes(model.output_norm);
vram_weights += ggml_nbytes(model.output_norm_b);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
vram_weights += ggml_nbytes(model.output);
}
} }
const uint32_t n_ff = hparams.n_ff; const uint32_t n_ff = hparams.n_ff;
@ -1967,7 +1975,7 @@ static void llm_load_tensors(
model.layers.resize(n_layer); model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) { for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -1978,6 +1986,11 @@ static void llm_load_tensors(
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
if (backend == GGML_BACKEND_GPU) {
vram_weights += ggml_nbytes(layer.attn_norm_2);
vram_weights += ggml_nbytes(layer.attn_norm_2_b);
}
} }
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
@ -1985,6 +1998,13 @@ static void llm_load_tensors(
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
}
} }
} break; } break;
default: default:
@ -3007,14 +3027,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
} }
static std::string llama_escape_whitespace(const std::string& text) { static std::string llama_escape_whitespace(const std::string& text) {
std::string result = "\xe2\x96\x81"; std::string result = text;
for (size_t offs = 0; offs < text.length(); ++offs) { replace_all(result, " ", "\xe2\x96\x81");
if (text[offs] == ' ') {
result += "\xe2\x96\x81";
} else {
result += text[offs];
}
}
return result; return result;
} }
@ -3199,7 +3213,7 @@ struct llm_bigram_bpe {
}; };
struct llm_tokenizer_bpe { struct llm_tokenizer_bpe {
llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; } llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) { void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
int final_prev_index = -1; int final_prev_index = -1;
@ -3351,8 +3365,6 @@ private:
return words; return words;
} }
bool flag_g2ws = false;
const llama_vocab & vocab; const llama_vocab & vocab;
std::vector<llm_symbol> symbols; std::vector<llm_symbol> symbols;
@ -3361,39 +3373,26 @@ private:
llm_bigram_bpe::queue work_queue; llm_bigram_bpe::queue work_queue;
}; };
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) { static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos) {
std::vector<llama_vocab::id> output; std::vector<llama_vocab::id> output;
if (raw_text.empty()) { if (raw_text.empty()) {
return output; return output;
} }
if (bos && vocab.special_bos_id != -1) {
output.push_back(vocab.special_bos_id);
}
switch (vocab.type) { switch (vocab.type) {
case LLAMA_VOCAB_TYPE_SPM: case LLAMA_VOCAB_TYPE_SPM:
{ {
llm_tokenizer_spm tokenizer(vocab); llm_tokenizer_spm tokenizer(vocab);
tokenizer.tokenize(llama_escape_whitespace(raw_text), output);
if (bos) {
output.push_back(vocab.special_bos_id);
}
std::string text;
if (escape) {
text = llama_escape_whitespace(raw_text);
} else {
text = raw_text;
}
tokenizer.tokenize(text, output);
} break; } break;
case LLAMA_VOCAB_TYPE_BPE: case LLAMA_VOCAB_TYPE_BPE:
{ {
llm_tokenizer_bpe tokenizer(vocab, escape); llm_tokenizer_bpe tokenizer(vocab);
if (bos && vocab.special_bos_id != -1) {
output.push_back(vocab.special_bos_id);
}
tokenizer.tokenize(raw_text, output); tokenizer.tokenize(raw_text, output);
} break; } break;
}; };
@ -4306,6 +4305,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
ctx->t_sample_us += ggml_time_us() - t_start_sample_us; ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
} }
//
// Beam search
//
struct llama_beam {
std::vector<llama_token> tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
// Sort beams by probability. In case of ties, prefer beams at eob.
bool operator<(const llama_beam & rhs) const {
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
}
// Shift off first n tokens and discard them.
void shift_tokens(const size_t n) {
if (n) {
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
tokens.resize(tokens.size() - n);
}
}
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
};
// A struct for calculating logit-related info.
struct llama_logit_info {
const float * const logits;
const int n_vocab;
const float max_l;
const float normalizer;
struct sum_exp {
float max_l;
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
};
llama_logit_info(llama_context * ctx)
: logits(llama_get_logits(ctx))
, n_vocab(llama_n_vocab(ctx))
, max_l(*std::max_element(logits, logits + n_vocab))
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
{ }
llama_token_data get_token_data(const llama_token token_id) const {
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
return {token_id, logits[token_id], p};
}
// Return top k token_data by logit.
std::vector<llama_token_data> top_k(size_t k) {
std::vector<llama_token_data> min_heap; // min-heap by logit
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
min_heap.reserve(k_min);
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
min_heap.push_back(get_token_data(token_id));
}
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
std::make_heap(min_heap.begin(), min_heap.end(), comp);
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
if (min_heap.front().logit < logits[token_id]) {
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
min_heap.back().id = token_id;
min_heap.back().logit = logits[token_id];
std::push_heap(min_heap.begin(), min_heap.end(), comp);
}
}
return min_heap;
}
float probability_from_logit(float logit) {
return normalizer * std::exp(logit - max_l);
}
};
struct llama_beam_search_data {
llama_context * ctx;
size_t n_beams;
int n_past;
int n_predict;
int n_threads;
std::vector<llama_beam> beams;
std::vector<llama_beam> next_beams;
// Re-calculated on each loop iteration
size_t common_prefix_length;
// Used to communicate to/from callback on beams state.
std::vector<llama_beam_view> beam_views;
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
: ctx(ctx)
, n_beams(n_beams)
, n_past(n_past)
, n_predict(n_predict)
, n_threads(n_threads)
, beam_views(n_beams) {
beams.reserve(n_beams);
next_beams.reserve(n_beams);
}
// Collapse beams to a single beam given by index.
void collapse_beams(const size_t beam_idx) {
if (0u < beam_idx) {
std::swap(beams[0], beams[beam_idx]);
}
beams.resize(1);
}
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
// The repetitive patterns below reflect the 2 stages of heaps:
// * Gather elements until the vector is full, then call std::make_heap() on it.
// * If the heap is full and a new element is found that should be included, pop the
// least element to the back(), replace it with the new, then push it into the heap.
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
// Min-heaps use a greater-than comparator.
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
if (beam.eob) {
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
if (next_beams.size() < n_beams) {
next_beams.push_back(std::move(beam));
if (next_beams.size() == n_beams) {
std::make_heap(next_beams.begin(), next_beams.end(), comp);
}
} else if (next_beams.front().p < beam.p) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = std::move(beam);
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
} else {
// beam is not at end-of-sentence, so branch with next top_k tokens.
if (!beam.tokens.empty()) {
llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
}
llama_logit_info logit_info(ctx);
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
size_t i=0;
if (next_beams.size() < n_beams) {
for (; next_beams.size() < n_beams ; ++i) {
llama_beam next_beam = beam;
next_beam.tokens.push_back(next_tokens[i].id);
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
next_beams.push_back(std::move(next_beam));
}
std::make_heap(next_beams.begin(), next_beams.end(), comp);
} else {
for (; next_beams.front().p == 0.0f ; ++i) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = beam;
next_beams.back().tokens.push_back(next_tokens[i].id);
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
}
for (; i < n_beams ; ++i) {
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
if (next_beams.front().p < next_p) {
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
next_beams.back() = beam;
next_beams.back().tokens.push_back(next_tokens[i].id);
next_beams.back().p = next_p;
std::push_heap(next_beams.begin(), next_beams.end(), comp);
}
}
}
}
// Find common_prefix_length based on beams.
// Requires beams is not empty.
size_t find_common_prefix_length() {
size_t common_prefix_length = beams[0].tokens.size();
for (size_t i = 1 ; i < beams.size() ; ++i) {
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
if (beams[0].tokens[j] != beams[i].tokens[j]) {
common_prefix_length = j;
break;
}
}
}
return common_prefix_length;
}
// Construct beams_state to send back to caller via the callback function.
// Side effect: set common_prefix_length = find_common_prefix_length();
llama_beams_state get_beams_state(const bool last_call) {
for (size_t i = 0 ; i < beams.size() ; ++i) {
beam_views[i] = beams[i].view();
}
common_prefix_length = find_common_prefix_length();
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
}
// Loop:
// * while i < n_predict, AND
// * any of the beams have not yet reached end-of-beam (eob), AND
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
// (since all other beam probabilities can only decrease)
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
!beams[top_beam_index()].eob ; ++i) {
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
if (common_prefix_length) {
llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
n_past += common_prefix_length;
}
// Zero-out next_beam probabilities to place them last in following min-heap.
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
for (llama_beam & beam : beams) {
beam.shift_tokens(common_prefix_length);
fill_next_beams_by_top_probabilities(beam);
}
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
beams.swap(next_beams);
renormalize_beam_probabilities(beams);
}
collapse_beams(top_beam_index());
callback(callback_data, get_beams_state(true));
}
// As beams grow, the cumulative probabilities decrease.
// Renormalize them to avoid floating point underflow.
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
}
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
size_t top_beam_index() {
return std::max_element(beams.begin(), beams.end()) - beams.begin();
}
// Copy (p,eob) for each beam which may have been changed by the callback.
void update_beams_from_beam_views() {
for (size_t i = 0 ; i < beams.size() ; ++i) {
beams[i].p = beam_views[i].p;
beams[i].eob = beam_views[i].eob;
}
}
};
void llama_beam_search(llama_context * ctx,
llama_beam_search_callback_fn_t callback, void * callback_data,
size_t n_beams, int n_past, int n_predict, int n_threads) {
assert(ctx);
const int64_t t_start_sample_us = ggml_time_us();
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
beam_search_data.loop(callback, callback_data);
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
ctx->n_sample++;
}
// //
// quantization // quantization
// //
@ -4403,6 +4653,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false)); std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
llama_model model;
llm_load_arch(*ml, model);
llm_load_hparams(*ml, model, 0, 0, 0);
const size_t align = GGUF_DEFAULT_ALIGNMENT; const size_t align = GGUF_DEFAULT_ALIGNMENT;
struct gguf_context * ctx_out = gguf_init_empty(); struct gguf_context * ctx_out = gguf_init_empty();
@ -4428,6 +4682,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
++n_feed_forward_w2; ++n_feed_forward_w2;
} }
} }
if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
__func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
}
int i_attention_wv = 0; int i_attention_wv = 0;
int i_feed_forward_w2 = 0; int i_feed_forward_w2 = 0;
@ -4504,8 +4762,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) { if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
int nx = tensor->ne[0]; int nx = tensor->ne[0];
int ny = tensor->ne[1]; if (nx % QK_K == 0) {
if (nx % QK_K == 0 && ny % QK_K == 0) {
new_type = GGML_TYPE_Q6_K; new_type = GGML_TYPE_Q6_K;
} }
} else if (name.find("attn_v.weight") != std::string::npos) { } else if (name.find("attn_v.weight") != std::string::npos) {
@ -4519,6 +4776,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) && else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K; (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
if (model.type == MODEL_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
}
++i_attention_wv; ++i_attention_wv;
} else if (name.find("ffn_down.weight") != std::string::npos) { } else if (name.find("ffn_down.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
@ -4548,8 +4811,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
int nx = tensor->ne[0]; int nx = tensor->ne[0];
int ny = tensor->ne[1]; int ny = tensor->ne[1];
if (nx % QK_K != 0 || ny % QK_K != 0) { if (nx % QK_K != 0) {
LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
convert_incompatible_tensor = true; convert_incompatible_tensor = true;
} }
} }
@ -5277,13 +5540,29 @@ int llama_model_n_embd(const struct llama_model * model) {
return model->hparams.n_embd; return model->hparams.n_embd;
} }
int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) { int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
return snprintf(buf, buf_size, "%s %s %s", return snprintf(buf, buf_size, "%s %s %s",
model->name.c_str(), model->name.c_str(),
llama_model_type_name(model->type), llama_model_type_name(model->type),
llama_model_ftype_name(model->ftype).c_str()); llama_model_ftype_name(model->ftype).c_str());
} }
uint64_t llama_model_size(const struct llama_model * model) {
uint64_t size = 0;
for (const auto & it : model->tensors_by_name) {
size += ggml_nbytes(it.second);
}
return size;
}
uint64_t llama_model_n_params(const struct llama_model * model) {
uint64_t nparams = 0;
for (const auto & it : model->tensors_by_name) {
nparams += ggml_nelements(it.second);
}
return nparams;
}
int llama_model_quantize( int llama_model_quantize(
const char * fname_inp, const char * fname_inp,
const char * fname_out, const char * fname_out,
@ -5808,8 +6087,7 @@ int llama_tokenize_with_model(
llama_token * tokens, llama_token * tokens,
int n_max_tokens, int n_max_tokens,
bool add_bos) { bool add_bos) {
auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM; auto res = llama_tokenize_internal(model->vocab, text, add_bos);
auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
if (n_max_tokens < (int) res.size()) { if (n_max_tokens < (int) res.size()) {
LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);

llama.h

@ -254,7 +254,11 @@ extern "C" {
LLAMA_API int llama_model_n_embd (const struct llama_model * model); LLAMA_API int llama_model_n_embd (const struct llama_model * model);
// Get a string describing the model type // Get a string describing the model type
LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size); LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
// Returns the total size of all the tensors in the model in bytes
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
// Returns the total number of parameters in the model
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
// Returns 0 on success // Returns 0 on success
LLAMA_API int llama_model_quantize( LLAMA_API int llama_model_quantize(
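As a rough illustration (the helper name, buffer size, and log format are assumptions for this sketch, not part of this change), the three accessors added above can be combined into a one-line model summary:

```cpp
// Illustrative only: assumes #include <cstdio> and "llama.h".
static void log_model_summary(const struct llama_model * model) {
    char desc[128];
    llama_model_desc(model, desc, sizeof(desc));                       // e.g. "LLaMA v2 7B mostly Q4_K"
    fprintf(stderr, "model: %s | size = %.2f GiB | params = %.2f B\n",
            desc,
            llama_model_size(model)     / (1024.0 * 1024.0 * 1024.0),  // bytes -> GiB
            llama_model_n_params(model) * 1e-9);                       // count -> billions
}
```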
@ -465,6 +469,43 @@ extern "C" {
/// @details Accepts the sampled token into the grammar /// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token); LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
//
// Beam search
//
struct llama_beam_view {
const llama_token * tokens;
size_t n_tokens;
float p; // Cumulative beam probability (renormalized relative to all beams)
bool eob; // Callback should set this to true when a beam is at end-of-beam.
};
// Passed to beam_search_callback function.
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
// These pointers are valid only during the synchronous callback, so should not be saved.
struct llama_beams_state {
struct llama_beam_view * beam_views;
size_t n_beams; // Number of elements in beam_views[].
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
bool last_call; // True iff this is the last callback invocation.
};
// Type of pointer to the beam_search_callback function.
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, llama_beams_state);
/// @details Deterministically returns entire sentence constructed by a beam search.
/// @param ctx Pointer to the llama_context.
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
/// @param callback_data A pointer that is simply passed back to callback.
/// @param n_beams Number of beams to use.
/// @param n_past Number of tokens already evaluated.
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
/// @param n_threads Number of threads as passed to llama_eval().
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
// Performance information // Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx); LLAMA_API void llama_print_timings(struct llama_context * ctx);
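For reference, a minimal sketch of how a caller might drive the beam-search API declared above. The `beam_state` struct, the callback name, the parameter values, and the EOS check via `llama_token_eos(ctx)` are assumptions made for this sketch, not part of this change; the callback copies out the tokens all beams agree on and flags beams that have reached end-of-sentence.

```cpp
// Sketch only: assumes #include <vector> and "llama.h".
struct beam_state {
    llama_context * ctx;
    std::vector<llama_token> response; // tokens that all beams have agreed on so far
};

// Matches llama_beam_search_callback_fn_t: (void * callback_data, llama_beams_state).
static void example_beam_search_callback(void * callback_data, llama_beams_state beams_state) {
    beam_state & state = *static_cast<beam_state *>(callback_data);

    // Tokens in the common prefix are final for every beam, so copy them out
    // before they are shifted off in subsequent callbacks.
    if (beams_state.common_prefix_length > 0) {
        const llama_beam_view & bv = beams_state.beam_views[0];
        state.response.insert(state.response.end(),
                              bv.tokens, bv.tokens + beams_state.common_prefix_length);
    }

    // Flag beams ending in EOS as end-of-beam; llama_beam_search reads eob back
    // from beam_views after the callback returns.
    for (size_t i = 0; i < beams_state.n_beams; ++i) {
        llama_beam_view & bv = beams_state.beam_views[i];
        if (!bv.eob && bv.n_tokens > 0 &&
            bv.tokens[bv.n_tokens - 1] == llama_token_eos(state.ctx)) {
            bv.eob = true;
        }
    }
}

// Typical invocation after the prompt has been evaluated (n_past tokens in context):
//   beam_state state{ctx, {}};
//   llama_beam_search(ctx, example_beam_search_callback, &state,
//                     /*n_beams=*/4, n_past, /*n_predict=*/128, /*n_threads=*/4);
```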


@ -1,2 +1,3 @@
numpy==1.24 numpy==1.24
sentencepiece==0.1.98 sentencepiece==0.1.98
gguf>=0.1.0


@ -100,7 +100,8 @@ int main(int argc, char **argv) {
bool success = true; bool success = true;
for (const auto & test_kv : k_tests()) { for (const auto & test_kv : k_tests()) {
std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true); // Add a space in front of the first character to match OG llama tokenizer behavior
std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
fprintf(stderr, "%s : '%s' tokenized to '%s'\n", fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str()); __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());