Merge branch 'master' into betterlogs
This commit is contained in:
commit
5031c50e48
35 changed files with 3291 additions and 1358 deletions
44
.devops/full-rocm.Dockerfile
Normal file
44
.devops/full-rocm.Dockerfile
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
|
# This needs to generally match the container host's environment.
|
||||||
|
ARG ROCM_VERSION=5.6
|
||||||
|
|
||||||
|
# Target the CUDA build image
|
||||||
|
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||||
|
|
||||||
|
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||||
|
|
||||||
|
# Unless otherwise specified, we make a fat build.
|
||||||
|
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||||
|
# This is mostly tied to rocBLAS supported archs.
|
||||||
|
ARG ROCM_DOCKER_ARCH=\
|
||||||
|
gfx803 \
|
||||||
|
gfx900 \
|
||||||
|
gfx906 \
|
||||||
|
gfx908 \
|
||||||
|
gfx90a \
|
||||||
|
gfx1010 \
|
||||||
|
gfx1030 \
|
||||||
|
gfx1100 \
|
||||||
|
gfx1101 \
|
||||||
|
gfx1102
|
||||||
|
|
||||||
|
COPY requirements.txt requirements.txt
|
||||||
|
|
||||||
|
RUN pip install --upgrade pip setuptools wheel \
|
||||||
|
&& pip install -r requirements.txt
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Set nvcc architecture
|
||||||
|
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||||
|
# Enable ROCm
|
||||||
|
ENV LLAMA_HIPBLAS=1
|
||||||
|
ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
|
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
|
||||||
|
RUN make
|
||||||
|
|
||||||
|
ENTRYPOINT ["/app/.devops/tools.sh"]
|
44
.devops/main-rocm.Dockerfile
Normal file
44
.devops/main-rocm.Dockerfile
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
|
# This needs to generally match the container host's environment.
|
||||||
|
ARG ROCM_VERSION=5.6
|
||||||
|
|
||||||
|
# Target the CUDA build image
|
||||||
|
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||||
|
|
||||||
|
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||||
|
|
||||||
|
# Unless otherwise specified, we make a fat build.
|
||||||
|
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||||
|
# This is mostly tied to rocBLAS supported archs.
|
||||||
|
ARG ROCM_DOCKER_ARCH=\
|
||||||
|
gfx803 \
|
||||||
|
gfx900 \
|
||||||
|
gfx906 \
|
||||||
|
gfx908 \
|
||||||
|
gfx90a \
|
||||||
|
gfx1010 \
|
||||||
|
gfx1030 \
|
||||||
|
gfx1100 \
|
||||||
|
gfx1101 \
|
||||||
|
gfx1102
|
||||||
|
|
||||||
|
COPY requirements.txt requirements.txt
|
||||||
|
|
||||||
|
RUN pip install --upgrade pip setuptools wheel \
|
||||||
|
&& pip install -r requirements.txt
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Set nvcc architecture
|
||||||
|
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||||
|
# Enable ROCm
|
||||||
|
ENV LLAMA_HIPBLAS=1
|
||||||
|
ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
|
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
|
||||||
|
RUN make
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/app/main" ]
|
|
@ -5,14 +5,7 @@
|
||||||
.vscode/
|
.vscode/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
|
|
||||||
build/
|
build*/
|
||||||
build-em/
|
|
||||||
build-debug/
|
|
||||||
build-release/
|
|
||||||
build-static/
|
|
||||||
build-no-accel/
|
|
||||||
build-sanitize-addr/
|
|
||||||
build-sanitize-thread/
|
|
||||||
|
|
||||||
models/*
|
models/*
|
||||||
|
|
||||||
|
|
17
.gitignore
vendored
17
.gitignore
vendored
|
@ -17,20 +17,7 @@
|
||||||
.vs/
|
.vs/
|
||||||
.vscode/
|
.vscode/
|
||||||
|
|
||||||
build/
|
build*/
|
||||||
build-em/
|
|
||||||
build-debug/
|
|
||||||
build-release/
|
|
||||||
build-ci-debug/
|
|
||||||
build-ci-release/
|
|
||||||
build-static/
|
|
||||||
build-cublas/
|
|
||||||
build-opencl/
|
|
||||||
build-metal/
|
|
||||||
build-mpi/
|
|
||||||
build-no-accel/
|
|
||||||
build-sanitize-addr/
|
|
||||||
build-sanitize-thread/
|
|
||||||
out/
|
out/
|
||||||
tmp/
|
tmp/
|
||||||
|
|
||||||
|
@ -61,6 +48,7 @@ compile_commands.json
|
||||||
CMakeSettings.json
|
CMakeSettings.json
|
||||||
|
|
||||||
__pycache__
|
__pycache__
|
||||||
|
dist
|
||||||
|
|
||||||
zig-out/
|
zig-out/
|
||||||
zig-cache/
|
zig-cache/
|
||||||
|
@ -71,7 +59,6 @@ perf-*.txt
|
||||||
|
|
||||||
examples/jeopardy/results.txt
|
examples/jeopardy/results.txt
|
||||||
|
|
||||||
pyproject.toml
|
|
||||||
poetry.lock
|
poetry.lock
|
||||||
poetry.toml
|
poetry.toml
|
||||||
|
|
||||||
|
|
|
@ -74,6 +74,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
|
||||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||||
|
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||||
|
@ -352,6 +353,43 @@ if (LLAMA_CLBLAST)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
if (LLAMA_HIPBLAS)
|
||||||
|
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
|
||||||
|
|
||||||
|
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||||
|
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
|
||||||
|
endif()
|
||||||
|
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
|
||||||
|
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
find_package(hip)
|
||||||
|
find_package(hipblas)
|
||||||
|
find_package(rocblas)
|
||||||
|
|
||||||
|
if (${hipblas_FOUND} AND ${hip_FOUND})
|
||||||
|
message(STATUS "HIP and hipBLAS found")
|
||||||
|
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
|
||||||
|
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
|
||||||
|
if (LLAMA_CUDA_FORCE_DMMV)
|
||||||
|
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
|
||||||
|
endif()
|
||||||
|
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||||
|
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||||
|
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||||
|
target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
|
||||||
|
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
|
||||||
|
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||||
|
|
||||||
|
if (LLAMA_STATIC)
|
||||||
|
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
|
||||||
|
endif()
|
||||||
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
|
||||||
|
else()
|
||||||
|
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
if (LLAMA_ALL_WARNINGS)
|
if (LLAMA_ALL_WARNINGS)
|
||||||
if (NOT MSVC)
|
if (NOT MSVC)
|
||||||
set(c_flags
|
set(c_flags
|
||||||
|
|
24
Makefile
24
Makefile
|
@ -280,6 +280,30 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
endif # LLAMA_CLBLAST
|
endif # LLAMA_CLBLAST
|
||||||
|
|
||||||
|
ifdef LLAMA_HIPBLAS
|
||||||
|
ROCM_PATH ?= /opt/rocm
|
||||||
|
HIPCC ?= $(ROCM_PATH)/bin/hipcc
|
||||||
|
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
||||||
|
LLAMA_CUDA_DMMV_X ?= 32
|
||||||
|
LLAMA_CUDA_MMV_Y ?= 1
|
||||||
|
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
||||||
|
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
||||||
|
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
||||||
|
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
||||||
|
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
||||||
|
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
|
||||||
|
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||||
|
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
||||||
|
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||||
|
HIPFLAGS += -DCC_TURING=1000000000
|
||||||
|
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||||
|
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||||
|
endif # LLAMA_CUDA_FORCE_DMMV
|
||||||
|
OBJS += ggml-cuda.o
|
||||||
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||||
|
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
||||||
|
endif # LLAMA_HIPBLAS
|
||||||
|
|
||||||
ifdef LLAMA_METAL
|
ifdef LLAMA_METAL
|
||||||
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
||||||
CXXFLAGS += -DGGML_USE_METAL
|
CXXFLAGS += -DGGML_USE_METAL
|
||||||
|
|
29
README.md
29
README.md
|
@ -422,6 +422,35 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
|
|
||||||
|
- #### hipBLAS
|
||||||
|
|
||||||
|
This provide BLAS acceleation on HIP supported GPU like AMD GPU.
|
||||||
|
Make sure to have ROCm installed.
|
||||||
|
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
|
||||||
|
Windows support is coming soon...
|
||||||
|
|
||||||
|
- Using `make`:
|
||||||
|
```bash
|
||||||
|
make LLAMA_HIPBLAS=1
|
||||||
|
```
|
||||||
|
- Using `CMake`:
|
||||||
|
```bash
|
||||||
|
mkdir build
|
||||||
|
cd build
|
||||||
|
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
|
||||||
|
cmake --build .
|
||||||
|
```
|
||||||
|
|
||||||
|
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
|
||||||
|
If your GPU is not officialy supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
|
||||||
|
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
|
||||||
|
|
||||||
|
| Option | Legal values | Default | Description |
|
||||||
|
|-------------------------|------------------------|---------|-------------|
|
||||||
|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||||
|
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||||
|
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
|
|
||||||
- #### CLBlast
|
- #### CLBlast
|
||||||
|
|
||||||
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
|
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
|
||||||
|
|
|
@ -391,6 +391,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||||
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
||||||
|
|
||||||
python3 -m pip install -r ${SRC}/requirements.txt
|
python3 -m pip install -r ${SRC}/requirements.txt
|
||||||
|
python3 -m pip install --editable gguf-py
|
||||||
fi
|
fi
|
||||||
|
|
||||||
ret=0
|
ret=0
|
||||||
|
|
|
@ -635,9 +635,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
|
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
|
||||||
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
|
fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
|
||||||
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
|
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
|
||||||
|
#endif // GGML_USE_CUBLAS
|
||||||
#endif
|
#endif
|
||||||
fprintf(stdout, " --mtest compute maximum memory usage\n");
|
fprintf(stdout, " --mtest compute maximum memory usage\n");
|
||||||
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
|
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
|
||||||
|
|
|
@ -30,6 +30,7 @@ struct gpt_params {
|
||||||
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
|
||||||
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
|
||||||
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
|
||||||
|
int32_t n_beams = 0; // if non-zero then use beam search of given width.
|
||||||
float rope_freq_base = 10000.0f; // RoPE base frequency
|
float rope_freq_base = 10000.0f; // RoPE base frequency
|
||||||
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
|
||||||
|
|
||||||
|
|
45
convert.py
45
convert.py
|
@ -105,6 +105,7 @@ class Params:
|
||||||
f_norm_eps: float
|
f_norm_eps: float
|
||||||
|
|
||||||
f_rope_freq_base: Optional[float] = None
|
f_rope_freq_base: Optional[float] = None
|
||||||
|
f_rope_scale: Optional[float] = None
|
||||||
|
|
||||||
ftype: Optional[GGMLFileType] = None
|
ftype: Optional[GGMLFileType] = None
|
||||||
|
|
||||||
|
@ -160,13 +161,20 @@ class Params:
|
||||||
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
||||||
config = json.load(open(config_path))
|
config = json.load(open(config_path))
|
||||||
|
|
||||||
n_vocab = config["vocab_size"]
|
n_vocab = config["vocab_size"]
|
||||||
n_embd = config["hidden_size"]
|
n_embd = config["hidden_size"]
|
||||||
n_layer = config["num_hidden_layers"]
|
n_layer = config["num_hidden_layers"]
|
||||||
n_ff = config["intermediate_size"]
|
n_ff = config["intermediate_size"]
|
||||||
n_head = config["num_attention_heads"]
|
n_head = config["num_attention_heads"]
|
||||||
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
|
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
|
||||||
f_norm_eps = config["rms_norm_eps"]
|
f_norm_eps = config["rms_norm_eps"]
|
||||||
|
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
|
||||||
|
|
||||||
|
rope_scaling = config.get("rope_scaling")
|
||||||
|
if isinstance(rope_scaling, dict) and rope_scaling.get("type") == "linear":
|
||||||
|
f_rope_scale = config["rope_scaling"].get("factor")
|
||||||
|
else:
|
||||||
|
f_rope_scale = None
|
||||||
|
|
||||||
n_mult = Params.find_n_mult(n_ff, n_embd)
|
n_mult = Params.find_n_mult(n_ff, n_embd)
|
||||||
|
|
||||||
|
@ -179,15 +187,17 @@ class Params:
|
||||||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
n_vocab = n_vocab,
|
n_vocab = n_vocab,
|
||||||
n_embd = n_embd,
|
n_embd = n_embd,
|
||||||
n_mult = n_mult,
|
n_mult = n_mult,
|
||||||
n_layer = n_layer,
|
n_layer = n_layer,
|
||||||
n_ctx = n_ctx,
|
n_ctx = n_ctx,
|
||||||
n_ff = n_ff,
|
n_ff = n_ff,
|
||||||
n_head = n_head,
|
n_head = n_head,
|
||||||
n_head_kv = n_head_kv,
|
n_head_kv = n_head_kv,
|
||||||
f_norm_eps = f_norm_eps,
|
f_norm_eps = f_norm_eps,
|
||||||
|
f_rope_freq_base = f_rope_freq_base,
|
||||||
|
f_rope_scale = f_rope_scale,
|
||||||
)
|
)
|
||||||
|
|
||||||
# LLaMA v2 70B params.json
|
# LLaMA v2 70B params.json
|
||||||
|
@ -771,6 +781,9 @@ class OutputFile:
|
||||||
if params.f_rope_freq_base:
|
if params.f_rope_freq_base:
|
||||||
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
self.gguf.add_rope_freq_base(params.f_rope_freq_base)
|
||||||
|
|
||||||
|
if params.f_rope_scale:
|
||||||
|
self.gguf.add_rope_scale_linear(params.f_rope_scale)
|
||||||
|
|
||||||
if params.ftype:
|
if params.ftype:
|
||||||
self.gguf.add_file_type(params.ftype)
|
self.gguf.add_file_type(params.ftype)
|
||||||
|
|
||||||
|
|
|
@ -25,6 +25,7 @@ else()
|
||||||
add_subdirectory(simple)
|
add_subdirectory(simple)
|
||||||
add_subdirectory(embd-input)
|
add_subdirectory(embd-input)
|
||||||
add_subdirectory(llama-bench)
|
add_subdirectory(llama-bench)
|
||||||
|
add_subdirectory(beam_search)
|
||||||
if (LLAMA_METAL)
|
if (LLAMA_METAL)
|
||||||
add_subdirectory(metal)
|
add_subdirectory(metal)
|
||||||
endif()
|
endif()
|
||||||
|
|
8
examples/beam_search/CMakeLists.txt
Normal file
8
examples/beam_search/CMakeLists.txt
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
set(TARGET beam_search)
|
||||||
|
add_executable(${TARGET} beam_search.cpp)
|
||||||
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
if(TARGET BUILD_INFO)
|
||||||
|
add_dependencies(${TARGET} BUILD_INFO)
|
||||||
|
endif()
|
188
examples/beam_search/beam_search.cpp
Normal file
188
examples/beam_search/beam_search.cpp
Normal file
|
@ -0,0 +1,188 @@
|
||||||
|
#ifndef _GNU_SOURCE
|
||||||
|
#define _GNU_SOURCE
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "build-info.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cinttypes>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <ctime>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
|
||||||
|
#include <signal.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#elif defined (_WIN32)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#define NOMINMAX
|
||||||
|
#include <windows.h>
|
||||||
|
#include <signal.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Used for debugging to print out beam tokens.
|
||||||
|
struct ostream_beam_view {
|
||||||
|
llama_context * ctx;
|
||||||
|
llama_beam_view beam_view;
|
||||||
|
};
|
||||||
|
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
|
||||||
|
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
||||||
|
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
||||||
|
os << llama_token_to_str(obv.ctx, obv.beam_view.tokens[i]);
|
||||||
|
}
|
||||||
|
return os << ')';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Put here anything you want back in beam_search_callback().
|
||||||
|
struct beam_search_callback_data {
|
||||||
|
llama_context * ctx;
|
||||||
|
std::vector<llama_token> response;
|
||||||
|
};
|
||||||
|
|
||||||
|
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||||
|
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||||
|
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
|
||||||
|
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Function matching type llama_beam_search_callback_fn_t.
|
||||||
|
// Custom callback example is called each time the beams lengths increase:
|
||||||
|
// * Show progress by printing ',' following by number of convergent beam tokens if any.
|
||||||
|
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||||
|
// This is also called when the stop condition is met.
|
||||||
|
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||||
|
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||||
|
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
||||||
|
// Mark beams as EOS as needed.
|
||||||
|
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
llama_beam_view& beam_view = beams_state.beam_views[i];
|
||||||
|
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
|
||||||
|
beam_view.eob = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf(","); // Show progress
|
||||||
|
if (const size_t n = beams_state.common_prefix_length) {
|
||||||
|
callback_data.response.resize(callback_data.response.size() + n);
|
||||||
|
assert(0u < beams_state.n_beams);
|
||||||
|
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||||
|
std::copy(tokens, tokens + n, callback_data.response.end() - n);
|
||||||
|
printf("%lu", n);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
#if 1 // DEBUG: print current beams for this iteration
|
||||||
|
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
|
||||||
|
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char ** argv)
|
||||||
|
{
|
||||||
|
gpt_params params;
|
||||||
|
//params.n_gpu_layers = 200;
|
||||||
|
|
||||||
|
//---------------------------------
|
||||||
|
// Print help :
|
||||||
|
//---------------------------------
|
||||||
|
|
||||||
|
if ( argc < 2 || argv[1][0] == '-' )
|
||||||
|
{
|
||||||
|
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
|
||||||
|
return 1 ;
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------
|
||||||
|
// Load parameters :
|
||||||
|
//---------------------------------
|
||||||
|
|
||||||
|
params.model = argv[1];
|
||||||
|
|
||||||
|
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
|
||||||
|
|
||||||
|
if ( argc > 3 )
|
||||||
|
{
|
||||||
|
params.prompt = argv[3];
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( params.prompt.empty() )
|
||||||
|
{
|
||||||
|
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------
|
||||||
|
// Init LLM :
|
||||||
|
//---------------------------------
|
||||||
|
|
||||||
|
llama_backend_init(params.numa);
|
||||||
|
|
||||||
|
llama_model * model;
|
||||||
|
llama_context * ctx;
|
||||||
|
|
||||||
|
std::tie(model, ctx) = llama_init_from_gpt_params( params );
|
||||||
|
|
||||||
|
if ( model == NULL )
|
||||||
|
{
|
||||||
|
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------
|
||||||
|
// Tokenize the prompt :
|
||||||
|
//---------------------------------
|
||||||
|
|
||||||
|
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
|
const size_t max_context_size = llama_n_ctx( ctx );
|
||||||
|
const size_t max_tokens_list_size = max_context_size - 4 ;
|
||||||
|
|
||||||
|
if (tokens_list.size() > max_tokens_list_size)
|
||||||
|
{
|
||||||
|
fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
|
||||||
|
__func__ , tokens_list.size() , max_tokens_list_size );
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
fprintf( stderr, "\n\n" );
|
||||||
|
|
||||||
|
// Print the tokens from the prompt :
|
||||||
|
|
||||||
|
for( auto id : tokens_list )
|
||||||
|
{
|
||||||
|
std::cout << llama_token_to_str(ctx, id);
|
||||||
|
}
|
||||||
|
std::cout << std::flush;
|
||||||
|
|
||||||
|
int n_past = llama_get_kv_cache_token_count(ctx);
|
||||||
|
if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
n_past += tokens_list.size();
|
||||||
|
|
||||||
|
beam_search_callback_data callback_data{ctx, {}};
|
||||||
|
size_t const beam_width = static_cast<size_t>(params.n_beams);
|
||||||
|
int const n_predict = 256;
|
||||||
|
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict, params.n_threads);
|
||||||
|
|
||||||
|
std::cout << "\n\n";
|
||||||
|
for (llama_token const token_id : callback_data.response) {
|
||||||
|
std::cout << llama_token_to_str(ctx,token_id);
|
||||||
|
}
|
||||||
|
std::cout << std::endl;
|
||||||
|
|
||||||
|
llama_free( ctx );
|
||||||
|
llama_free_model( model );
|
||||||
|
|
||||||
|
llama_backend_free();
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -18,9 +18,7 @@
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
#include "build-info.h"
|
#include "build-info.h"
|
||||||
#ifdef GGML_USE_CUBLAS
|
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
#endif
|
|
||||||
|
|
||||||
// utils
|
// utils
|
||||||
static uint64_t get_time_ns() {
|
static uint64_t get_time_ns() {
|
||||||
|
@ -443,6 +441,8 @@ struct test {
|
||||||
static const std::string gpu_info;
|
static const std::string gpu_info;
|
||||||
std::string model_filename;
|
std::string model_filename;
|
||||||
std::string model_type;
|
std::string model_type;
|
||||||
|
uint64_t model_size;
|
||||||
|
uint64_t model_n_params;
|
||||||
int n_batch;
|
int n_batch;
|
||||||
int n_threads;
|
int n_threads;
|
||||||
bool f32_kv;
|
bool f32_kv;
|
||||||
|
@ -459,8 +459,10 @@ struct test {
|
||||||
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
|
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
|
||||||
model_filename = inst.model;
|
model_filename = inst.model;
|
||||||
char buf[128];
|
char buf[128];
|
||||||
llama_model_type(lmodel, buf, sizeof(buf));
|
llama_model_desc(lmodel, buf, sizeof(buf));
|
||||||
model_type = buf;
|
model_type = buf;
|
||||||
|
model_size = llama_model_size(lmodel);
|
||||||
|
model_n_params = llama_model_n_params(lmodel);
|
||||||
n_batch = inst.n_batch;
|
n_batch = inst.n_batch;
|
||||||
n_threads = inst.n_threads;
|
n_threads = inst.n_threads;
|
||||||
f32_kv = inst.f32_kv;
|
f32_kv = inst.f32_kv;
|
||||||
|
@ -504,7 +506,7 @@ struct test {
|
||||||
|
|
||||||
static std::string get_backend() {
|
static std::string get_backend() {
|
||||||
if (cuda) {
|
if (cuda) {
|
||||||
return "CUDA";
|
return GGML_CUDA_NAME;
|
||||||
}
|
}
|
||||||
if (opencl) {
|
if (opencl) {
|
||||||
return "OpenCL";
|
return "OpenCL";
|
||||||
|
@ -526,7 +528,7 @@ struct test {
|
||||||
"build_commit", "build_number",
|
"build_commit", "build_number",
|
||||||
"cuda", "opencl", "metal", "gpu_blas", "blas",
|
"cuda", "opencl", "metal", "gpu_blas", "blas",
|
||||||
"cpu_info", "gpu_info",
|
"cpu_info", "gpu_info",
|
||||||
"model_filename", "model_type",
|
"model_filename", "model_type", "model_size", "model_n_params",
|
||||||
"n_batch", "n_threads", "f16_kv",
|
"n_batch", "n_threads", "f16_kv",
|
||||||
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
|
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
|
||||||
"n_prompt", "n_gen", "test_time",
|
"n_prompt", "n_gen", "test_time",
|
||||||
|
@ -540,6 +542,7 @@ struct test {
|
||||||
|
|
||||||
static field_type get_field_type(const std::string & field) {
|
static field_type get_field_type(const std::string & field) {
|
||||||
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
|
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
|
||||||
|
field == "model_size" || field == "model_n_params" ||
|
||||||
field == "n_gpu_layers" || field == "main_gpu" ||
|
field == "n_gpu_layers" || field == "main_gpu" ||
|
||||||
field == "n_prompt" || field == "n_gen" ||
|
field == "n_prompt" || field == "n_gen" ||
|
||||||
field == "avg_ns" || field == "stddev_ns") {
|
field == "avg_ns" || field == "stddev_ns") {
|
||||||
|
@ -575,7 +578,7 @@ struct test {
|
||||||
build_commit, std::to_string(build_number),
|
build_commit, std::to_string(build_number),
|
||||||
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||||
cpu_info, gpu_info,
|
cpu_info, gpu_info,
|
||||||
model_filename, model_type,
|
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||||
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
|
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
|
||||||
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
|
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
|
||||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||||
|
@ -711,8 +714,15 @@ struct markdown_printer : public printer {
|
||||||
return -30;
|
return -30;
|
||||||
}
|
}
|
||||||
if (field == "t/s") {
|
if (field == "t/s") {
|
||||||
return 15;
|
return 16;
|
||||||
}
|
}
|
||||||
|
if (field == "size" || field == "params") {
|
||||||
|
return 10;
|
||||||
|
}
|
||||||
|
if (field == "n_gpu_layers") {
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
int width = std::max((int)field.length(), 10);
|
int width = std::max((int)field.length(), 10);
|
||||||
|
|
||||||
if (test::get_field_type(field) == test::STRING) {
|
if (test::get_field_type(field) == test::STRING) {
|
||||||
|
@ -721,9 +731,28 @@ struct markdown_printer : public printer {
|
||||||
return width;
|
return width;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::string get_field_display_name(const std::string & field) {
|
||||||
|
if (field == "n_gpu_layers") {
|
||||||
|
return "ngl";
|
||||||
|
}
|
||||||
|
if (field == "n_threads") {
|
||||||
|
return "threads";
|
||||||
|
}
|
||||||
|
if (field == "mul_mat_q") {
|
||||||
|
return "mmq";
|
||||||
|
}
|
||||||
|
if (field == "tensor_split") {
|
||||||
|
return "ts";
|
||||||
|
}
|
||||||
|
return field;
|
||||||
|
}
|
||||||
|
|
||||||
void print_header(const cmd_params & params) override {
|
void print_header(const cmd_params & params) override {
|
||||||
// select fields to print
|
// select fields to print
|
||||||
fields = { "model", "backend" };
|
fields.push_back("model");
|
||||||
|
fields.push_back("size");
|
||||||
|
fields.push_back("params");
|
||||||
|
fields.push_back("backend");
|
||||||
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
|
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
|
||||||
if (!is_cpu_backend) {
|
if (!is_cpu_backend) {
|
||||||
fields.push_back("n_gpu_layers");
|
fields.push_back("n_gpu_layers");
|
||||||
|
@ -754,7 +783,7 @@ struct markdown_printer : public printer {
|
||||||
|
|
||||||
fprintf(fout, "|");
|
fprintf(fout, "|");
|
||||||
for (const auto & field : fields) {
|
for (const auto & field : fields) {
|
||||||
fprintf(fout, " %*s |", get_field_width(field), field.c_str());
|
fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
|
||||||
}
|
}
|
||||||
fprintf(fout, "\n");
|
fprintf(fout, "\n");
|
||||||
fprintf(fout, "|");
|
fprintf(fout, "|");
|
||||||
|
@ -771,12 +800,26 @@ struct markdown_printer : public printer {
|
||||||
fprintf(fout, "|");
|
fprintf(fout, "|");
|
||||||
for (const auto & field : fields) {
|
for (const auto & field : fields) {
|
||||||
std::string value;
|
std::string value;
|
||||||
|
char buf[128];
|
||||||
if (field == "model") {
|
if (field == "model") {
|
||||||
value = t.model_type;
|
value = t.model_type;
|
||||||
|
} else if (field == "size") {
|
||||||
|
if (t.model_size < 1024*1024*1024) {
|
||||||
|
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
|
||||||
|
} else {
|
||||||
|
snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
value = buf;
|
||||||
|
} else if (field == "params") {
|
||||||
|
if (t.model_n_params < 1000*1000*1000) {
|
||||||
|
snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
|
||||||
|
} else {
|
||||||
|
snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
|
||||||
|
}
|
||||||
|
value = buf;
|
||||||
} else if (field == "backend") {
|
} else if (field == "backend") {
|
||||||
value = test::get_backend();
|
value = test::get_backend();
|
||||||
} else if (field == "test") {
|
} else if (field == "test") {
|
||||||
char buf[128];
|
|
||||||
if (t.n_prompt > 0 && t.n_gen == 0) {
|
if (t.n_prompt > 0 && t.n_gen == 0) {
|
||||||
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
|
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
|
||||||
} else if (t.n_gen > 0 && t.n_prompt == 0) {
|
} else if (t.n_gen > 0 && t.n_prompt == 0) {
|
||||||
|
@ -787,7 +830,6 @@ struct markdown_printer : public printer {
|
||||||
}
|
}
|
||||||
value = buf;
|
value = buf;
|
||||||
} else if (field == "t/s") {
|
} else if (field == "t/s") {
|
||||||
char buf[128];
|
|
||||||
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
||||||
value = buf;
|
value = buf;
|
||||||
} else if (vmap.find(field) != vmap.end()) {
|
} else if (vmap.find(field) != vmap.end()) {
|
||||||
|
|
|
@ -201,13 +201,20 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
// Add BOS if SPM tokenizer
|
||||||
|
const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
|
|
||||||
// tokenize the prompt
|
// tokenize the prompt
|
||||||
LOG("Tokenize the prompt\n")
|
LOG("Tokenize the prompt\n")
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
|
||||||
|
if (llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM) {
|
||||||
|
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||||
|
params.prompt.insert(0, 1, ' ');
|
||||||
|
}
|
||||||
|
|
||||||
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
} else {
|
} else {
|
||||||
embd_inp = session_tokens;
|
embd_inp = session_tokens;
|
||||||
}
|
}
|
||||||
|
@ -229,11 +236,13 @@ int main(int argc, char ** argv) {
|
||||||
if (ctx_guidance) {
|
if (ctx_guidance) {
|
||||||
params.cfg_negative_prompt.insert(0, 1, ' ');
|
params.cfg_negative_prompt.insert(0, 1, ' ');
|
||||||
LOG("cfg_negative_prompt: \"%s\"\n", LOG_TOSTR(params.cfg_negative_prompt))
|
LOG("cfg_negative_prompt: \"%s\"\n", LOG_TOSTR(params.cfg_negative_prompt))
|
||||||
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, is_spm);
|
|
||||||
|
guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos);
|
||||||
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(guidance_inp,ctx_guidance))
|
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(guidance_inp,ctx_guidance))
|
||||||
|
|
||||||
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, is_spm);
|
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, add_bos);
|
||||||
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(original_inp,ctx))
|
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(original_inp,ctx))
|
||||||
|
|
||||||
original_prompt_len = original_inp.size();
|
original_prompt_len = original_inp.size();
|
||||||
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
|
||||||
LOG("original_prompt_len: %s", LOG_TOSTR(original_prompt_len))
|
LOG("original_prompt_len: %s", LOG_TOSTR(original_prompt_len))
|
||||||
|
@ -290,7 +299,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// prefix & suffix for instruct mode
|
// prefix & suffix for instruct mode
|
||||||
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", is_spm);
|
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos);
|
||||||
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false);
|
||||||
|
|
||||||
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
// in instruct mode, we inject a prefix and a suffix to each input by the user
|
||||||
|
|
|
@ -6,6 +6,8 @@
|
||||||
#include <ctime>
|
#include <ctime>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
|
#include <thread>
|
||||||
|
#include <mutex>
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
|
@ -27,6 +29,40 @@ std::vector<float> softmax(const std::vector<float>& logits) {
|
||||||
return probs;
|
return probs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
float log_softmax(int n_vocab, const float * logits, int tok) {
|
||||||
|
float max_logit = logits[0];
|
||||||
|
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
|
||||||
|
double sum_exp = 0.0;
|
||||||
|
for (int i = 0; i < n_vocab; ++i) sum_exp += expf(logits[i] - max_logit);
|
||||||
|
return logits[tok] - max_logit - log(sum_exp);
|
||||||
|
}
|
||||||
|
|
||||||
|
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread>& workers,
|
||||||
|
double& nll, double& nll2) {
|
||||||
|
|
||||||
|
std::mutex mutex;
|
||||||
|
int counter = 0;
|
||||||
|
auto compute = [&mutex, &counter, &nll, &nll2, n_vocab, logits, tokens, n_token] () {
|
||||||
|
double local_nll = 0, local_nll2 = 0;
|
||||||
|
while (true) {
|
||||||
|
std::unique_lock<std::mutex> lock(mutex);
|
||||||
|
int i = counter++;
|
||||||
|
if (i >= n_token) {
|
||||||
|
nll += local_nll; nll2 += local_nll2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
lock.unlock();
|
||||||
|
double v = -log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
|
||||||
|
local_nll += v;
|
||||||
|
local_nll2 += v*v;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
for (auto& w : workers) w = std::thread(compute);
|
||||||
|
compute();
|
||||||
|
for (auto& w : workers) w.join();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
void perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
||||||
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
|
||||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||||
|
@ -166,9 +202,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
double nll = 0.0;
|
double nll = 0.0;
|
||||||
|
double nll2 = 0.0;
|
||||||
|
|
||||||
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);
|
||||||
|
|
||||||
|
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||||
|
|
||||||
for (int i = 0; i < n_chunk; ++i) {
|
for (int i = 0; i < n_chunk; ++i) {
|
||||||
const int start = i * params.n_ctx;
|
const int start = i * params.n_ctx;
|
||||||
const int end = start + params.n_ctx;
|
const int end = start + params.n_ctx;
|
||||||
|
@ -228,26 +267,32 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
|
||||||
// Example, we have a context window of 512, we will compute perplexity for each of the
|
// Example, we have a context window of 512, we will compute perplexity for each of the
|
||||||
// last 256 tokens. Then, we split the input up into context window size chunks to
|
// last 256 tokens. Then, we split the input up into context window size chunks to
|
||||||
// process the entire prompt.
|
// process the entire prompt.
|
||||||
for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
|
const int first = std::min(512, params.n_ctx/2);
|
||||||
// Calculate probability of next token, given the previous ones.
|
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, params.n_ctx - 1 - first, workers, nll, nll2);
|
||||||
const std::vector<float> tok_logits(
|
count += params.n_ctx - first - 1;
|
||||||
logits.begin() + (j + 0) * n_vocab,
|
|
||||||
logits.begin() + (j + 1) * n_vocab);
|
|
||||||
|
|
||||||
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
|
||||||
|
|
||||||
nll += -std::log(prob);
|
|
||||||
++count;
|
|
||||||
}
|
|
||||||
// perplexity is e^(average negative log-likelihood)
|
// perplexity is e^(average negative log-likelihood)
|
||||||
if (params.ppl_output_type == 0) {
|
if (params.ppl_output_type == 0) {
|
||||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||||
} else {
|
} else {
|
||||||
printf("%8d %.4lf\n", i*params.n_ctx, std::exp(nll / count));
|
double av = nll/count;
|
||||||
|
double av2 = nll2/count - av*av;
|
||||||
|
if (av2 > 0) av2 = sqrt(av2/(count-1));
|
||||||
|
printf("%8d %.4lf %4lf %4lf\n", i*params.n_ctx, std::exp(nll / count), av, av2);
|
||||||
}
|
}
|
||||||
fflush(stdout);
|
fflush(stdout);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
nll2 /= count;
|
||||||
|
nll /= count;
|
||||||
|
nll2 -= nll * nll;
|
||||||
|
if (nll2 > 0) {
|
||||||
|
nll2 = sqrt(nll2/(count-1));
|
||||||
|
double ppl = exp(nll);
|
||||||
|
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||||
|
} else {
|
||||||
|
printf("Unexpected negative standard deviation of log(prob)\n");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
|
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
|
||||||
|
@ -306,6 +351,7 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
fprintf(stderr, "%s : loaded %zu tasks from prompt.\n", __func__, hs_task_count);
|
||||||
|
|
||||||
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
const bool is_spm = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM;
|
||||||
|
fprintf(stderr, "================================= is_spm = %d\n", is_spm);
|
||||||
|
|
||||||
// This is needed as usual for LLaMA models
|
// This is needed as usual for LLaMA models
|
||||||
const bool add_bos = is_spm;
|
const bool add_bos = is_spm;
|
||||||
|
@ -361,6 +407,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
double acc = 0.0f;
|
double acc = 0.0f;
|
||||||
const int n_vocab = llama_n_vocab(ctx);
|
const int n_vocab = llama_n_vocab(ctx);
|
||||||
|
|
||||||
|
std::vector<std::vector<int>> ending_tokens(4);
|
||||||
|
|
||||||
std::vector<float> tok_logits(n_vocab);
|
std::vector<float> tok_logits(n_vocab);
|
||||||
|
|
||||||
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
for (size_t task_idx = 0; task_idx < hs_task_count; task_idx++) {
|
||||||
|
@ -368,11 +416,21 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
|
std::vector<int> context_embd = ::llama_tokenize(ctx, hs_data[task_idx].context, add_bos);
|
||||||
size_t context_size = context_embd.size();
|
size_t context_size = context_embd.size();
|
||||||
|
|
||||||
|
for (int i = 0; i < 4; ++i) {
|
||||||
|
ending_tokens[i] = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[i], add_bos);
|
||||||
|
for (int k = 0; k < int(context_size); ++k) {
|
||||||
|
if (ending_tokens[i][k] != context_embd[k]) {
|
||||||
|
fprintf(stderr, "Oops: ending %d of task %d differs from context at position %d\n",i,int(task_idx),k);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Do the 1st ending
|
// Do the 1st ending
|
||||||
// In this case we include the context when evaluating
|
// In this case we include the context when evaluating
|
||||||
auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
|
//auto query_embd = ::llama_tokenize(ctx, hs_data[task_idx].context + hs_data[task_idx].ending[0], add_bos);
|
||||||
|
auto query_embd = ending_tokens[0];
|
||||||
auto query_size = query_embd.size();
|
auto query_size = query_embd.size();
|
||||||
//printf("First query: %d\n",(int)query_size);
|
|
||||||
|
|
||||||
// Stop if query wont fit the ctx window
|
// Stop if query wont fit the ctx window
|
||||||
if (query_size > (size_t)params.n_ctx) {
|
if (query_size > (size_t)params.n_ctx) {
|
||||||
|
@ -417,7 +475,8 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
|
for (size_t ending_idx = 1; ending_idx < 4; ending_idx++) {
|
||||||
|
|
||||||
// Tokenize the query
|
// Tokenize the query
|
||||||
query_embd = ::llama_tokenize(ctx, hs_data[task_idx].ending[ending_idx], false);
|
query_embd.resize(ending_tokens[ending_idx].size() - context_size);
|
||||||
|
std::memcpy(query_embd.data(), ending_tokens[ending_idx].data() + context_size, query_embd.size()*sizeof(int));
|
||||||
query_size = query_embd.size();
|
query_size = query_embd.size();
|
||||||
|
|
||||||
// Stop if query wont fit the ctx window
|
// Stop if query wont fit the ctx window
|
||||||
|
|
|
@ -77,34 +77,31 @@ You need to have [Node.js](https://nodejs.org/en) installed.
|
||||||
```bash
|
```bash
|
||||||
mkdir llama-client
|
mkdir llama-client
|
||||||
cd llama-client
|
cd llama-client
|
||||||
npm init
|
|
||||||
npm install axios
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Create a index.js file and put inside this:
|
Create a index.js file and put inside this:
|
||||||
|
|
||||||
```javascript
|
```javascript
|
||||||
const axios = require("axios");
|
|
||||||
|
|
||||||
const prompt = `Building a website can be done in 10 simple steps:`;
|
const prompt = `Building a website can be done in 10 simple steps:`;
|
||||||
|
|
||||||
async function Test() {
|
async function Test() {
|
||||||
let result = await axios.post("http://127.0.0.1:8080/completion", {
|
let response = await fetch("http://127.0.0.1:8080/completion", {
|
||||||
prompt,
|
method: 'POST',
|
||||||
n_predict: 512,
|
body: JSON.stringify({
|
||||||
});
|
prompt,
|
||||||
|
n_predict: 512,
|
||||||
// the response is received until completion finish
|
})
|
||||||
console.log(result.data.content);
|
})
|
||||||
|
console.log((await response.json()).content)
|
||||||
}
|
}
|
||||||
|
|
||||||
Test();
|
Test()
|
||||||
```
|
```
|
||||||
|
|
||||||
And run it:
|
And run it:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
node .
|
node index.js
|
||||||
```
|
```
|
||||||
|
|
||||||
## API Endpoints
|
## API Endpoints
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -102,6 +102,17 @@
|
||||||
padding: 0.5em;
|
padding: 0.5em;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
.prob-set {
|
||||||
|
padding: 0.3em;
|
||||||
|
border-bottom: 1px solid #ccc;
|
||||||
|
}
|
||||||
|
|
||||||
|
.popover-content {
|
||||||
|
position: absolute;
|
||||||
|
background-color: white;
|
||||||
|
padding: 0.2em;
|
||||||
|
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
|
||||||
|
}
|
||||||
|
|
||||||
textarea {
|
textarea {
|
||||||
padding: 5px;
|
padding: 5px;
|
||||||
|
@ -133,11 +144,17 @@
|
||||||
font-size: 80%;
|
font-size: 80%;
|
||||||
color: #888;
|
color: #888;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@media (prefers-color-scheme: dark) {
|
||||||
|
.popover-content {
|
||||||
|
background-color: black;
|
||||||
|
}
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
|
|
||||||
<script type="module">
|
<script type="module">
|
||||||
import {
|
import {
|
||||||
html, h, signal, effect, computed, render, useSignal, useEffect, useRef
|
html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
|
||||||
} from '/index.js';
|
} from '/index.js';
|
||||||
|
|
||||||
import { llama } from '/completion.js';
|
import { llama } from '/completion.js';
|
||||||
|
@ -168,6 +185,7 @@
|
||||||
mirostat_tau: 5, // target entropy
|
mirostat_tau: 5, // target entropy
|
||||||
mirostat_eta: 0.1, // learning rate
|
mirostat_eta: 0.1, // learning rate
|
||||||
grammar: '',
|
grammar: '',
|
||||||
|
n_probs: 0, // no completion_probabilities
|
||||||
})
|
})
|
||||||
|
|
||||||
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */
|
/* START: Support for storing prompt templates and parameters in borwser LocalStorage */
|
||||||
|
@ -334,10 +352,21 @@
|
||||||
|
|
||||||
const prompt = template(session.value.template, {
|
const prompt = template(session.value.template, {
|
||||||
message: msg,
|
message: msg,
|
||||||
history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"),
|
history: session.value.transcript.flatMap(
|
||||||
|
([name, data]) =>
|
||||||
|
template(
|
||||||
|
session.value.historyTemplate,
|
||||||
|
{
|
||||||
|
name,
|
||||||
|
message: Array.isArray(data) ?
|
||||||
|
data.map(msg => msg.content).join('').replace(/^\s/, '') :
|
||||||
|
data,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
).join("\n"),
|
||||||
});
|
});
|
||||||
|
|
||||||
let currentMessage = '';
|
const currentMessages = [];
|
||||||
const history = session.value.transcript
|
const history = session.value.transcript
|
||||||
|
|
||||||
const llamaParams = {
|
const llamaParams = {
|
||||||
|
@ -347,15 +376,19 @@
|
||||||
|
|
||||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
||||||
const data = chunk.data;
|
const data = chunk.data;
|
||||||
currentMessage += data.content;
|
|
||||||
|
|
||||||
// remove leading whitespace
|
|
||||||
currentMessage = currentMessage.replace(/^\s+/, "")
|
|
||||||
|
|
||||||
transcriptUpdate([...history, ["{{char}}", currentMessage]])
|
|
||||||
|
|
||||||
if (data.stop) {
|
if (data.stop) {
|
||||||
console.log("Completion finished: '", currentMessage, "', summary: ", data);
|
while (
|
||||||
|
currentMessages.length > 0 &&
|
||||||
|
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
||||||
|
) {
|
||||||
|
currentMessages.pop();
|
||||||
|
}
|
||||||
|
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
||||||
|
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
||||||
|
} else {
|
||||||
|
currentMessages.push(data);
|
||||||
|
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
||||||
}
|
}
|
||||||
|
|
||||||
if (data.timings) {
|
if (data.timings) {
|
||||||
|
@ -420,8 +453,18 @@
|
||||||
}
|
}
|
||||||
}, [messages])
|
}, [messages])
|
||||||
|
|
||||||
const chatLine = ([user, msg]) => {
|
const chatLine = ([user, data], index) => {
|
||||||
return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdownish} text=${template(msg)} /></p>`
|
let message
|
||||||
|
const isArrayMessage = Array.isArray(data)
|
||||||
|
if (params.value.n_probs > 0 && isArrayMessage) {
|
||||||
|
message = html`<${Probabilities} data=${data} />`
|
||||||
|
} else {
|
||||||
|
const text = isArrayMessage ?
|
||||||
|
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
|
||||||
|
data;
|
||||||
|
message = html`<${Markdownish} text=${template(text)} />`
|
||||||
|
}
|
||||||
|
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
||||||
};
|
};
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
|
@ -568,10 +611,71 @@
|
||||||
${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
|
${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
|
||||||
${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
|
${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
|
||||||
</fieldset>
|
</fieldset>
|
||||||
|
<fieldset>
|
||||||
|
${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
|
||||||
|
</fieldset>
|
||||||
</details>
|
</details>
|
||||||
</form>
|
</form>
|
||||||
`
|
`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const probColor = (p) => {
|
||||||
|
const r = Math.floor(192 * (1 - p));
|
||||||
|
const g = Math.floor(192 * p);
|
||||||
|
return `rgba(${r},${g},0,0.3)`;
|
||||||
|
}
|
||||||
|
|
||||||
|
const Probabilities = (params) => {
|
||||||
|
return params.data.map(msg => {
|
||||||
|
const { completion_probabilities } = msg;
|
||||||
|
if (
|
||||||
|
!completion_probabilities ||
|
||||||
|
completion_probabilities.length === 0
|
||||||
|
) return msg.content
|
||||||
|
|
||||||
|
if (completion_probabilities.length > 1) {
|
||||||
|
// Not for byte pair
|
||||||
|
if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
|
||||||
|
|
||||||
|
const splitData = completion_probabilities.map(prob => ({
|
||||||
|
content: prob.content,
|
||||||
|
completion_probabilities: [prob]
|
||||||
|
}))
|
||||||
|
return html`<${Probabilities} data=${splitData} />`
|
||||||
|
}
|
||||||
|
|
||||||
|
const { probs, content } = completion_probabilities[0]
|
||||||
|
const found = probs.find(p => p.tok_str === msg.content)
|
||||||
|
const pColor = found ? probColor(found.prob) : 'transparent'
|
||||||
|
|
||||||
|
const popoverChildren = html`
|
||||||
|
<div class="prob-set">
|
||||||
|
${probs.map((p, index) => {
|
||||||
|
return html`
|
||||||
|
<div
|
||||||
|
key=${index}
|
||||||
|
title=${`prob: ${p.prob}`}
|
||||||
|
style=${{
|
||||||
|
padding: '0.3em',
|
||||||
|
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<span>${p.tok_str}: </span>
|
||||||
|
<span>${Math.floor(p.prob * 100)}%</span>
|
||||||
|
</div>
|
||||||
|
`
|
||||||
|
})}
|
||||||
|
</div>
|
||||||
|
`
|
||||||
|
|
||||||
|
return html`
|
||||||
|
<${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
|
||||||
|
${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
|
||||||
|
</>
|
||||||
|
`
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// poor mans markdown replacement
|
// poor mans markdown replacement
|
||||||
const Markdownish = (params) => {
|
const Markdownish = (params) => {
|
||||||
const md = params.text
|
const md = params.text
|
||||||
|
@ -600,10 +704,121 @@
|
||||||
`
|
`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// simple popover impl
|
||||||
|
const Popover = (props) => {
|
||||||
|
const isOpen = useSignal(false);
|
||||||
|
const position = useSignal({ top: '0px', left: '0px' });
|
||||||
|
const buttonRef = useRef(null);
|
||||||
|
const popoverRef = useRef(null);
|
||||||
|
|
||||||
|
const togglePopover = () => {
|
||||||
|
if (buttonRef.current) {
|
||||||
|
const rect = buttonRef.current.getBoundingClientRect();
|
||||||
|
position.value = {
|
||||||
|
top: `${rect.bottom + window.scrollY}px`,
|
||||||
|
left: `${rect.left + window.scrollX}px`,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
isOpen.value = !isOpen.value;
|
||||||
|
};
|
||||||
|
|
||||||
|
const handleClickOutside = (event) => {
|
||||||
|
if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
|
||||||
|
isOpen.value = false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
useEffect(() => {
|
||||||
|
document.addEventListener('mousedown', handleClickOutside);
|
||||||
|
return () => {
|
||||||
|
document.removeEventListener('mousedown', handleClickOutside);
|
||||||
|
};
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
return html`
|
||||||
|
<span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
|
||||||
|
${isOpen.value && html`
|
||||||
|
<${Portal} into="#portal">
|
||||||
|
<div
|
||||||
|
ref=${popoverRef}
|
||||||
|
class="popover-content"
|
||||||
|
style=${{
|
||||||
|
top: position.value.top,
|
||||||
|
left: position.value.left,
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
${props.popoverChildren}
|
||||||
|
</div>
|
||||||
|
</${Portal}>
|
||||||
|
`}
|
||||||
|
`;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
|
||||||
|
/** Redirect rendering of descendants into the given CSS selector */
|
||||||
|
class Portal extends Component {
|
||||||
|
componentDidUpdate(props) {
|
||||||
|
for (let i in props) {
|
||||||
|
if (props[i] !== this.props[i]) {
|
||||||
|
return setTimeout(this.renderLayer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
componentDidMount() {
|
||||||
|
this.isMounted = true;
|
||||||
|
this.renderLayer = this.renderLayer.bind(this);
|
||||||
|
this.renderLayer();
|
||||||
|
}
|
||||||
|
|
||||||
|
componentWillUnmount() {
|
||||||
|
this.renderLayer(false);
|
||||||
|
this.isMounted = false;
|
||||||
|
if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
|
||||||
|
}
|
||||||
|
|
||||||
|
findNode(node) {
|
||||||
|
return typeof node === 'string' ? document.querySelector(node) : node;
|
||||||
|
}
|
||||||
|
|
||||||
|
renderLayer(show = true) {
|
||||||
|
if (!this.isMounted) return;
|
||||||
|
|
||||||
|
// clean up old node if moving bases:
|
||||||
|
if (this.props.into !== this.intoPointer) {
|
||||||
|
this.intoPointer = this.props.into;
|
||||||
|
if (this.into && this.remote) {
|
||||||
|
this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
|
||||||
|
}
|
||||||
|
this.into = this.findNode(this.props.into);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.remote = render(html`
|
||||||
|
<${PortalProxy} context=${this.context}>
|
||||||
|
${show && this.props.children || null}
|
||||||
|
</${PortalProxy}>
|
||||||
|
`, this.into, this.remote);
|
||||||
|
}
|
||||||
|
|
||||||
|
render() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// high-order component that renders its first child if it exists.
|
||||||
|
// used as a conditional rendering proxy.
|
||||||
|
class PortalProxy extends Component {
|
||||||
|
getChildContext() {
|
||||||
|
return this.props.context;
|
||||||
|
}
|
||||||
|
render({ children }) {
|
||||||
|
return children || null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
function App(props) {
|
function App(props) {
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
<div id="container">
|
<div>
|
||||||
<header>
|
<header>
|
||||||
<h1>llama.cpp</h1>
|
<h1>llama.cpp</h1>
|
||||||
</header>
|
</header>
|
||||||
|
@ -624,11 +839,13 @@
|
||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
|
|
||||||
render(h(App), document.body);
|
render(h(App), document.querySelector('#container'));
|
||||||
</script>
|
</script>
|
||||||
</head>
|
</head>
|
||||||
|
|
||||||
<body>
|
<body>
|
||||||
|
<div id="container"></div>
|
||||||
|
<div id="portal"></div>
|
||||||
</body>
|
</body>
|
||||||
|
|
||||||
</html>
|
</html>
|
||||||
|
|
|
@ -124,8 +124,9 @@ static void server_log(const char *level, const char *function, int line,
|
||||||
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
||||||
{
|
{
|
||||||
std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
|
std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
|
||||||
// if first bit is 1, meaning it's a partial character
|
// if the size is 1 and first bit is 1, meaning it's a partial character
|
||||||
if (out.size() > 0 && (out[0] & 0x80) == 0x80)
|
// (size > 1 meaning it's already a known token)
|
||||||
|
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
||||||
{
|
{
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << std::hex << (out[0] & 0xff);
|
ss << std::hex << (out[0] & 0xff);
|
||||||
|
@ -1208,6 +1209,62 @@ static void log_server_request(const Request &req, const Response &res)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
|
||||||
|
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Function matching type llama_beam_search_callback_fn_t.
|
||||||
|
// Custom callback example is called each time the beams lengths increase:
|
||||||
|
// * Show progress by printing ',' following by number of convergent beam tokens if any.
|
||||||
|
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||||
|
// This is also called when the stop condition is met.
|
||||||
|
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||||
|
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
|
||||||
|
auto & llama = *static_cast<llama_server_context*>(callback_data);
|
||||||
|
// Mark beams as EOS as needed.
|
||||||
|
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
llama_beam_view& beam_view = beams_state.beam_views[i];
|
||||||
|
if (!beam_view.eob && is_at_eob(llama, beam_view.tokens, beam_view.n_tokens)) {
|
||||||
|
beam_view.eob = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf(","); // Show progress
|
||||||
|
if (const size_t n = beams_state.common_prefix_length) {
|
||||||
|
llama.generated_token_probs.resize(llama.generated_token_probs.size() + n);
|
||||||
|
assert(0u < beams_state.n_beams);
|
||||||
|
const llama_token * tokens = beams_state.beam_views[0].tokens;
|
||||||
|
const auto map = [](llama_token tok) { return completion_token_output{{},tok}; };
|
||||||
|
std::transform(tokens, tokens + n, llama.generated_token_probs.end() - n, map);
|
||||||
|
printf("%lu", n);
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
#if 0 // DEBUG: print current beams for this iteration
|
||||||
|
std::cout << "\n\nCurrent beams:\n";
|
||||||
|
for (size_t i=0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
std::cout << "beams["<<i<<"]: " << ostream_beam_view{state.ctx,beams_state.beam_views[i]} << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
struct token_translator {
|
||||||
|
llama_context * ctx;
|
||||||
|
std::string operator()(llama_token tok) const { return llama_token_to_str(ctx, tok); }
|
||||||
|
std::string operator()(completion_token_output cto) const { return (*this)(cto.tok); }
|
||||||
|
};
|
||||||
|
|
||||||
|
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
|
||||||
|
auto & gtps = llama.generated_token_probs;
|
||||||
|
auto translator = token_translator{llama.ctx};
|
||||||
|
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
|
||||||
|
const size_t len = std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen);
|
||||||
|
if (llama.generated_text.capacity() < llama.generated_text.size() + len) {
|
||||||
|
llama.generated_text.reserve(llama.generated_text.size() + len);
|
||||||
|
}
|
||||||
|
for (const completion_token_output & cto : gtps) {
|
||||||
|
llama.generated_text += translator(cto);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
int main(int argc, char **argv)
|
||||||
{
|
{
|
||||||
// own arguments required by this example
|
// own arguments required by this example
|
||||||
|
@ -1290,22 +1347,30 @@ int main(int argc, char **argv)
|
||||||
llama.beginCompletion();
|
llama.beginCompletion();
|
||||||
|
|
||||||
if (!llama.stream) {
|
if (!llama.stream) {
|
||||||
size_t stop_pos = std::string::npos;
|
if (llama.params.n_beams) {
|
||||||
|
// Fill llama.generated_token_probs vector with final beam.
|
||||||
|
llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
|
||||||
|
llama.n_past, llama.n_remain, llama.params.n_threads);
|
||||||
|
// Translate llama.generated_token_probs to llama.generated_text.
|
||||||
|
append_to_generated_text_from_generated_token_probs(llama);
|
||||||
|
} else {
|
||||||
|
size_t stop_pos = std::string::npos;
|
||||||
|
|
||||||
while (llama.has_next_token) {
|
while (llama.has_next_token) {
|
||||||
const completion_token_output token_with_probs = llama.doCompletion();
|
const completion_token_output token_with_probs = llama.doCompletion();
|
||||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
|
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
|
||||||
|
|
||||||
stop_pos = llama.findStoppingStrings(llama.generated_text,
|
stop_pos = llama.findStoppingStrings(llama.generated_text,
|
||||||
token_text.size(), STOP_FULL);
|
token_text.size(), STOP_FULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (stop_pos == std::string::npos) {
|
if (stop_pos == std::string::npos) {
|
||||||
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
|
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
|
||||||
}
|
}
|
||||||
if (stop_pos != std::string::npos) {
|
if (stop_pos != std::string::npos) {
|
||||||
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
|
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
|
||||||
llama.generated_text.end());
|
llama.generated_text.end());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
|
const json data = format_final_response(llama, llama.generated_text, llama.generated_token_probs);
|
||||||
|
@ -1321,59 +1386,86 @@ int main(int argc, char **argv)
|
||||||
|
|
||||||
while (llama.has_next_token) {
|
while (llama.has_next_token) {
|
||||||
const completion_token_output token_with_probs = llama.doCompletion();
|
const completion_token_output token_with_probs = llama.doCompletion();
|
||||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
|
if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
|
||||||
if (llama.multibyte_pending > 0) {
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok);
|
||||||
|
|
||||||
size_t pos = std::min(sent_count, llama.generated_text.size());
|
size_t pos = std::min(sent_count, llama.generated_text.size());
|
||||||
|
|
||||||
const std::string str_test = llama.generated_text.substr(pos);
|
const std::string str_test = llama.generated_text.substr(pos);
|
||||||
|
bool is_stop_full = false;
|
||||||
size_t stop_pos =
|
size_t stop_pos =
|
||||||
llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
|
llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
|
||||||
if (stop_pos != std::string::npos) {
|
if (stop_pos != std::string::npos) {
|
||||||
|
is_stop_full = true;
|
||||||
llama.generated_text.erase(
|
llama.generated_text.erase(
|
||||||
llama.generated_text.begin() + pos + stop_pos,
|
llama.generated_text.begin() + pos + stop_pos,
|
||||||
llama.generated_text.end());
|
llama.generated_text.end());
|
||||||
pos = std::min(sent_count, llama.generated_text.size());
|
pos = std::min(sent_count, llama.generated_text.size());
|
||||||
} else {
|
} else {
|
||||||
|
is_stop_full = false;
|
||||||
stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
|
stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
|
||||||
STOP_PARTIAL);
|
STOP_PARTIAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string to_send = llama.generated_text.substr(pos, stop_pos);
|
if (
|
||||||
sent_count += to_send.size();
|
stop_pos == std::string::npos ||
|
||||||
|
// Send rest of the text if we are at the end of the generation
|
||||||
|
(!llama.has_next_token && !is_stop_full && stop_pos > 0)
|
||||||
|
) {
|
||||||
|
const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
|
||||||
|
|
||||||
std::vector<completion_token_output> probs_output = {};
|
sent_count += to_send.size();
|
||||||
|
|
||||||
if (llama.params.n_probs > 0) {
|
std::vector<completion_token_output> probs_output = {};
|
||||||
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
|
||||||
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
if (llama.params.n_probs > 0) {
|
||||||
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
||||||
if (probs_pos < probs_stop_pos) {
|
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
||||||
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
|
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
||||||
|
if (probs_pos < probs_stop_pos) {
|
||||||
|
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
|
||||||
|
}
|
||||||
|
sent_token_probs_index = probs_stop_pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
const json data = format_partial_response(llama, to_send, probs_output);
|
||||||
|
|
||||||
|
const std::string str =
|
||||||
|
"data: " +
|
||||||
|
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||||
|
"\n\n";
|
||||||
|
|
||||||
|
LOG_VERBOSE("data stream", {
|
||||||
|
{ "to_send", str }
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!sink.write(str.data(), str.size())) {
|
||||||
|
LOG_VERBOSE("stream closed", {});
|
||||||
|
llama_print_timings(llama.ctx);
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
sent_token_probs_index = probs_stop_pos;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const json data = llama.has_next_token
|
if (!llama.has_next_token) {
|
||||||
? format_partial_response(llama, to_send, probs_output)
|
// Generation is done, send extra information.
|
||||||
// Generation is done, send extra information.
|
const json data = format_final_response(llama, "", llama.generated_token_probs);
|
||||||
: format_final_response(llama, to_send, llama.generated_token_probs);
|
|
||||||
|
|
||||||
const std::string str =
|
const std::string str =
|
||||||
"data: " +
|
"data: " +
|
||||||
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||||
"\n\n";
|
"\n\n";
|
||||||
|
|
||||||
LOG_VERBOSE("data stream", {
|
LOG_VERBOSE("data stream", {
|
||||||
{ "to_send", str }
|
{ "to_send", str }
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!sink.write(str.data(), str.size())) {
|
if (!sink.write(str.data(), str.size())) {
|
||||||
LOG_VERBOSE("stream closed", {});
|
LOG_VERBOSE("stream closed", {});
|
||||||
llama_print_timings(llama.ctx);
|
llama_print_timings(llama.ctx);
|
||||||
return false;
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
11
flake.nix
11
flake.nix
|
@ -21,6 +21,12 @@
|
||||||
CoreGraphics
|
CoreGraphics
|
||||||
CoreVideo
|
CoreVideo
|
||||||
]
|
]
|
||||||
|
else if isDarwin then
|
||||||
|
with pkgs.darwin.apple_sdk.frameworks; [
|
||||||
|
Accelerate
|
||||||
|
CoreGraphics
|
||||||
|
CoreVideo
|
||||||
|
]
|
||||||
else
|
else
|
||||||
with pkgs; [ openblas ]
|
with pkgs; [ openblas ]
|
||||||
);
|
);
|
||||||
|
@ -80,8 +86,13 @@
|
||||||
type = "app";
|
type = "app";
|
||||||
program = "${self.packages.${system}.default}/bin/llama";
|
program = "${self.packages.${system}.default}/bin/llama";
|
||||||
};
|
};
|
||||||
|
apps.quantize = {
|
||||||
|
type = "app";
|
||||||
|
program = "${self.packages.${system}.default}/bin/quantize";
|
||||||
|
};
|
||||||
apps.default = self.apps.${system}.llama;
|
apps.default = self.apps.${system}.llama;
|
||||||
devShells.default = pkgs.mkShell {
|
devShells.default = pkgs.mkShell {
|
||||||
|
buildInputs = [ llama-python ];
|
||||||
packages = nativeBuildInputs ++ osSpecific;
|
packages = nativeBuildInputs ++ osSpecific;
|
||||||
};
|
};
|
||||||
});
|
});
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
|
|
||||||
#define UNUSED(x) (void)(x)
|
#define UNUSED(x) (void)(x)
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
|
||||||
|
|
||||||
//#define GGML_ALLOCATOR_DEBUG
|
//#define GGML_ALLOCATOR_DEBUG
|
||||||
|
|
||||||
|
@ -67,7 +68,7 @@ struct ggml_allocr {
|
||||||
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||||
size_t max_size;
|
size_t max_size;
|
||||||
bool measure;
|
bool measure;
|
||||||
int parse_seq[GGML_MAX_NODES];
|
int parse_seq[GGML_MAX_CONCUR];
|
||||||
int parse_seq_len;
|
int parse_seq_len;
|
||||||
|
|
||||||
#ifdef GGML_ALLOCATOR_DEBUG
|
#ifdef GGML_ALLOCATOR_DEBUG
|
||||||
|
|
231
ggml-cuda.cu
231
ggml-cuda.cu
|
@ -6,15 +6,116 @@
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
|
|
||||||
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
|
#include <hip/hip_runtime.h>
|
||||||
|
#include <hipblas/hipblas.h>
|
||||||
|
#include <hip/hip_fp16.h>
|
||||||
|
#ifdef __HIP_PLATFORM_AMD__
|
||||||
|
// for rocblas_initialize()
|
||||||
|
#include "rocblas/rocblas.h"
|
||||||
|
#endif
|
||||||
|
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
||||||
|
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||||
|
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||||
|
#define CUBLAS_OP_N HIPBLAS_OP_N
|
||||||
|
#define CUBLAS_OP_T HIPBLAS_OP_T
|
||||||
|
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||||
|
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||||
|
#define CUDA_R_16F HIPBLAS_R_16F
|
||||||
|
#define CUDA_R_32F HIPBLAS_R_32F
|
||||||
|
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||||
|
#define cublasCreate hipblasCreate
|
||||||
|
#define cublasGemmEx hipblasGemmEx
|
||||||
|
#define cublasHandle_t hipblasHandle_t
|
||||||
|
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||||
|
#define cublasSetStream hipblasSetStream
|
||||||
|
#define cublasSgemm hipblasSgemm
|
||||||
|
#define cublasStatus_t hipblasStatus_t
|
||||||
|
#define cudaDeviceProp hipDeviceProp_t
|
||||||
|
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||||
|
#define cudaError_t hipError_t
|
||||||
|
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||||
|
#define cudaEventDisableTiming hipEventDisableTiming
|
||||||
|
#define cudaEventRecord hipEventRecord
|
||||||
|
#define cudaEvent_t hipEvent_t
|
||||||
|
#define cudaEventDestroy hipEventDestroy
|
||||||
|
#define cudaFree hipFree
|
||||||
|
#define cudaFreeHost hipHostFree
|
||||||
|
#define cudaGetDevice hipGetDevice
|
||||||
|
#define cudaGetDeviceCount hipGetDeviceCount
|
||||||
|
#define cudaGetDeviceProperties hipGetDeviceProperties
|
||||||
|
#define cudaGetErrorString hipGetErrorString
|
||||||
|
#define cudaGetLastError hipGetLastError
|
||||||
|
#define cudaMalloc hipMalloc
|
||||||
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||||
|
#define cudaMemcpy hipMemcpy
|
||||||
|
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
||||||
|
#define cudaMemcpyAsync hipMemcpyAsync
|
||||||
|
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||||
|
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||||
|
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||||
|
#define cudaMemcpyKind hipMemcpyKind
|
||||||
|
#define cudaMemset hipMemset
|
||||||
|
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||||
|
#define cudaSetDevice hipSetDevice
|
||||||
|
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||||
|
#define cudaStreamNonBlocking hipStreamNonBlocking
|
||||||
|
#define cudaStreamSynchronize hipStreamSynchronize
|
||||||
|
#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
|
||||||
|
#define cudaStream_t hipStream_t
|
||||||
|
#define cudaSuccess hipSuccess
|
||||||
|
#else
|
||||||
#include <cuda_runtime.h>
|
#include <cuda_runtime.h>
|
||||||
#include <cublas_v2.h>
|
#include <cublas_v2.h>
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||||
|
#ifndef CC_TURING
|
||||||
#define CC_TURING 700
|
#define CC_TURING 700
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
|
#define __CUDA_ARCH__ 1300
|
||||||
|
|
||||||
|
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||||
|
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||||
|
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||||
|
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||||
|
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
||||||
|
return reinterpret_cast<const int&>(c);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||||
|
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
||||||
|
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||||
|
#elif defined(__gfx1100__)
|
||||||
|
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
||||||
|
#elif defined(__gfx1010__) || defined(__gfx900__)
|
||||||
|
int tmp1;
|
||||||
|
int tmp2;
|
||||||
|
asm("\n \
|
||||||
|
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
|
||||||
|
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
|
||||||
|
v_add3_u32 %0, %1, %2, %0 \n \
|
||||||
|
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
|
||||||
|
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
|
||||||
|
v_add3_u32 %0, %1, %2, %0 \n \
|
||||||
|
"
|
||||||
|
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
|
||||||
|
: "v"(a), "v"(b)
|
||||||
|
);
|
||||||
|
#else
|
||||||
|
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||||
|
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||||
|
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
|
||||||
|
#endif
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
|
@ -424,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
|
||||||
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||||
const block_q4_1 * x = (const block_q4_1 *) vx;
|
const block_q4_1 * x = (const block_q4_1 *) vx;
|
||||||
|
|
||||||
const dfloat d = x[ib].dm.x;
|
const dfloat d = __low2half(x[ib].dm);
|
||||||
const dfloat m = x[ib].dm.y;
|
const dfloat m = __high2half(x[ib].dm);
|
||||||
|
|
||||||
const int vui = x[ib].qs[iqs];
|
const int vui = x[ib].qs[iqs];
|
||||||
|
|
||||||
|
@ -467,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
|
||||||
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||||
const block_q5_1 * x = (const block_q5_1 *) vx;
|
const block_q5_1 * x = (const block_q5_1 *) vx;
|
||||||
|
|
||||||
const dfloat d = x[ib].dm.x;
|
const dfloat d = __low2half(x[ib].dm);
|
||||||
const dfloat m = x[ib].dm.y;
|
const dfloat m = __high2half(x[ib].dm);
|
||||||
|
|
||||||
uint32_t qh;
|
uint32_t qh;
|
||||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||||
|
@ -520,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
|
||||||
const uint8_t q = x[i].qs[32*n + l];
|
const uint8_t q = x[i].qs[32*n + l];
|
||||||
float * y = yy + i*QK_K + 128*n;
|
float * y = yy + i*QK_K + 128*n;
|
||||||
|
|
||||||
float dall = x[i].dm.x;
|
float dall = __low2half(x[i].dm);
|
||||||
float dmin = x[i].dm.y;
|
float dmin = __high2half(x[i].dm);
|
||||||
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||||
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||||
|
@ -531,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
|
||||||
const int il = tid%16; // 0...15
|
const int il = tid%16; // 0...15
|
||||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||||
float * y = yy + i*QK_K + 16*is + il;
|
float * y = yy + i*QK_K + 16*is + il;
|
||||||
float dall = x[i].dm.x;
|
float dall = __low2half(x[i].dm);
|
||||||
float dmin = x[i].dm.y;
|
float dmin = __high2half(x[i].dm);
|
||||||
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||||
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||||
#endif
|
#endif
|
||||||
|
@ -618,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
|
||||||
|
|
||||||
float * y = yy + i*QK_K + 64*il + n*ir;
|
float * y = yy + i*QK_K + 64*il + n*ir;
|
||||||
|
|
||||||
const float dall = x[i].dm.x;
|
const float dall = __low2half(x[i].dm);
|
||||||
const float dmin = x[i].dm.y;
|
const float dmin = __high2half(x[i].dm);
|
||||||
|
|
||||||
const uint8_t * q = x[i].qs + 32*il + n*ir;
|
const uint8_t * q = x[i].qs + 32*il + n*ir;
|
||||||
|
|
||||||
|
@ -657,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
|
||||||
|
|
||||||
float * y = yy + i*QK_K + 64*il + 2*ir;
|
float * y = yy + i*QK_K + 64*il + 2*ir;
|
||||||
|
|
||||||
const float dall = x[i].dm.x;
|
const float dall = __low2half(x[i].dm);
|
||||||
const float dmin = x[i].dm.y;
|
const float dmin = __high2half(x[i].dm);
|
||||||
|
|
||||||
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
|
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
|
||||||
const uint8_t * qh = x[i].qh + 2*ir;
|
const uint8_t * qh = x[i].qh + 2*ir;
|
||||||
|
@ -770,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
||||||
const float * y = yy + i * QK_K + y_offset;
|
const float * y = yy + i * QK_K + y_offset;
|
||||||
const uint8_t * q = x[i].qs + q_offset;
|
const uint8_t * q = x[i].qs + q_offset;
|
||||||
|
|
||||||
const float dall = x[i].dm.x;
|
const float dall = __low2half(x[i].dm);
|
||||||
const float dmin = x[i].dm.y;
|
const float dmin = __high2half(x[i].dm);
|
||||||
|
|
||||||
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
|
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
|
||||||
aux[0] = a[0] & 0x0f0f0f0f;
|
aux[0] = a[0] & 0x0f0f0f0f;
|
||||||
|
@ -991,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
||||||
const float * y1 = yy + i*QK_K + y_offset;
|
const float * y1 = yy + i*QK_K + y_offset;
|
||||||
const float * y2 = y1 + 128;
|
const float * y2 = y1 + 128;
|
||||||
|
|
||||||
const float dall = x[i].dm.x;
|
const float dall = __low2half(x[i].dm);
|
||||||
const float dmin = x[i].dm.y;
|
const float dmin = __high2half(x[i].dm);
|
||||||
|
|
||||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||||
aux[0] = a[im+0] & kmask1;
|
aux[0] = a[im+0] & kmask1;
|
||||||
|
@ -1124,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
||||||
const float * y1 = yy + i*QK_K + y_offset;
|
const float * y1 = yy + i*QK_K + y_offset;
|
||||||
const float * y2 = y1 + 128;
|
const float * y2 = y1 + 128;
|
||||||
|
|
||||||
const float dall = x[i].dm.x;
|
const float dall = __low2half(x[i].dm);
|
||||||
const float dmin = x[i].dm.y;
|
const float dmin = __high2half(x[i].dm);
|
||||||
|
|
||||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||||
aux[0] = a[im+0] & kmask1;
|
aux[0] = a[im+0] & kmask1;
|
||||||
|
@ -1348,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
y[ib].ds.x = d;
|
reinterpret_cast<half&>(y[ib].ds.x) = d;
|
||||||
y[ib].ds.y = sum;
|
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||||
|
@ -2346,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
||||||
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
||||||
}
|
}
|
||||||
|
|
||||||
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
|
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
|
||||||
}
|
}
|
||||||
|
|
||||||
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
||||||
|
@ -2432,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < QR2_K; ++ i) {
|
for (int i = 0; i < QR2_K; ++ i) {
|
||||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
||||||
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
|
||||||
}
|
}
|
||||||
|
|
||||||
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
||||||
|
@ -2551,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < QR3_K; ++i) {
|
for (int i = 0; i < QR3_K; ++i) {
|
||||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
||||||
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
|
||||||
}
|
}
|
||||||
|
|
||||||
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
||||||
|
@ -2720,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||||
|
|
||||||
for (int i = 0; i < QR4_K; ++i) {
|
for (int i = 0; i < QR4_K; ++i) {
|
||||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||||
d8[i] = bq8i->ds.x;
|
d8[i] = __low2half(bq8i->ds);
|
||||||
|
|
||||||
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
||||||
u[2*i+0] = q8[0];
|
u[2*i+0] = q8[0];
|
||||||
|
@ -2747,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
||||||
const float dall = bq4_K->d[0];
|
const float dall = bq4_K->d[0];
|
||||||
const float dmin = bq4_K->d[1];
|
const float dmin = bq4_K->d[1];
|
||||||
|
|
||||||
const float d8_1 = bq8_1[0].ds.x;
|
const float d8_1 = __low2float(bq8_1[0].ds);
|
||||||
const float d8_2 = bq8_1[1].ds.x;
|
const float d8_2 = __low2float(bq8_1[1].ds);
|
||||||
|
|
||||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||||
|
@ -2901,7 +3002,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < QR5_K; ++i) {
|
for (int i = 0; i < QR5_K; ++i) {
|
||||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||||
d8[i] = bq8i->ds.x;
|
d8[i] = __low2float(bq8i->ds);
|
||||||
|
|
||||||
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
||||||
u[2*i+0] = q8[0];
|
u[2*i+0] = q8[0];
|
||||||
|
@ -2919,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
||||||
|
|
||||||
const float d = bq5_K->d;
|
const float d = bq5_K->d;
|
||||||
|
|
||||||
const float d8_1 = bq8_1[0].ds.x;
|
const float d8_1 = __low2half(bq8_1[0].ds);
|
||||||
const float d8_2 = bq8_1[1].ds.x;
|
const float d8_2 = __low2half(bq8_1[1].ds);
|
||||||
|
|
||||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||||
|
@ -3075,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int i = 0; i < QR6_K; ++i) {
|
for (int i = 0; i < QR6_K; ++i) {
|
||||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
||||||
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
|
||||||
}
|
}
|
||||||
|
|
||||||
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
||||||
|
@ -3243,7 +3344,7 @@ static __device__ __forceinline__ void mul_mat_q(
|
||||||
*dsi_dst = *dsi_src;
|
*dsi_dst = *dsi_src;
|
||||||
} else {
|
} else {
|
||||||
float * dfi_dst = (float *) dsi_dst;
|
float * dfi_dst = (float *) dsi_dst;
|
||||||
*dfi_dst = (*dsi_src).x;
|
*dfi_dst = __low2half(*dsi_src);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3907,28 +4008,27 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
||||||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: this implementation is wrong!
|
static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
||||||
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
||||||
// const float p_delta, const int p_delta_rows, const float theta_scale) {
|
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||||
// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
|
||||||
//
|
if (col >= ncols) {
|
||||||
// if (col >= ncols) {
|
return;
|
||||||
// return;
|
}
|
||||||
// }
|
|
||||||
//
|
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
// const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
const int i = row*ncols + col/2;
|
||||||
// const int i = row*ncols + col/2;
|
|
||||||
//
|
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
||||||
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
const float sin_theta = sinf(theta);
|
||||||
// const float sin_theta = sinf(theta);
|
const float cos_theta = cosf(theta);
|
||||||
// const float cos_theta = cosf(theta);
|
|
||||||
//
|
const float x0 = x[i + 0];
|
||||||
// const float x0 = x[i + 0];
|
const float x1 = x[i + ncols/2];
|
||||||
// const float x1 = x[i + ncols/2];
|
|
||||||
//
|
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||||
// dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
||||||
// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
}
|
||||||
//}
|
|
||||||
|
|
||||||
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
||||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
@ -4799,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
|
||||||
|
|
||||||
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
||||||
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
||||||
GGML_ASSERT(nrows % 2 == 0);
|
GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
|
||||||
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
||||||
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||||
const dim3 block_nums(nrows, num_blocks_x, 1);
|
const dim3 block_nums(nrows, num_blocks_x, 1);
|
||||||
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
||||||
|
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
||||||
|
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
||||||
|
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||||
|
const dim3 block_nums(nrows, num_blocks_x, 1);
|
||||||
|
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
||||||
|
}
|
||||||
|
|
||||||
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
||||||
GGML_ASSERT(nrows % 4 == 0);
|
GGML_ASSERT(nrows % 4 == 0);
|
||||||
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
||||||
|
@ -4937,10 +5045,18 @@ void ggml_init_cublas() {
|
||||||
static bool initialized = false;
|
static bool initialized = false;
|
||||||
|
|
||||||
if (!initialized) {
|
if (!initialized) {
|
||||||
|
|
||||||
|
#ifdef __HIP_PLATFORM_AMD__
|
||||||
|
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||||
|
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
||||||
|
rocblas_initialize();
|
||||||
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
|
#endif
|
||||||
|
|
||||||
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
|
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
|
||||||
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
||||||
int64_t total_vram = 0;
|
int64_t total_vram = 0;
|
||||||
fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
|
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||||
|
@ -5548,8 +5664,9 @@ inline void ggml_cuda_op_rope(
|
||||||
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
||||||
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
||||||
} else if (is_neox) {
|
} else if (is_neox) {
|
||||||
GGML_ASSERT(false && "RoPE NeoX not implemented yet");
|
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
||||||
#pragma message("TODO: implement RoPE NeoX for CUDA")
|
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
||||||
|
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
||||||
} else {
|
} else {
|
||||||
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
||||||
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
||||||
|
|
|
@ -2,6 +2,14 @@
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#ifdef GGML_USE_HIPBLAS
|
||||||
|
#define GGML_CUDA_NAME "ROCm"
|
||||||
|
#define GGML_CUBLAS_NAME "hipBLAS"
|
||||||
|
#else
|
||||||
|
#define GGML_CUDA_NAME "CUDA"
|
||||||
|
#define GGML_CUBLAS_NAME "cuBLAS"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
21
gguf-py/LICENSE
Normal file
21
gguf-py/LICENSE
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2023 Georgi Gerganov
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
55
gguf-py/README.md
Normal file
55
gguf-py/README.md
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
## gguf
|
||||||
|
|
||||||
|
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
|
||||||
|
(GGML Universal File) format.
|
||||||
|
|
||||||
|
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
|
||||||
|
as an example for its usage.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
```sh
|
||||||
|
pip install gguf
|
||||||
|
```
|
||||||
|
|
||||||
|
## Development
|
||||||
|
Maintainers who participate in development of this package are advised to install it in editable mode:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
cd /path/to/llama.cpp/gguf-py
|
||||||
|
|
||||||
|
pip install --editable .
|
||||||
|
```
|
||||||
|
|
||||||
|
**Note**: This may require to upgrade your Pip installation, with a message saying that editable installation currently requires `setup.py`.
|
||||||
|
In this case, upgrade Pip to the latest:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pip install --upgrade pip
|
||||||
|
```
|
||||||
|
|
||||||
|
## Publishing
|
||||||
|
To publish the package, you need to have `twine` and `build` installed:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
pip install build twine
|
||||||
|
```
|
||||||
|
|
||||||
|
Then, folow these steps to release a new version:
|
||||||
|
|
||||||
|
1. Update the version in `pyproject.toml`.
|
||||||
|
2. Build the package:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
python -m build
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Upload the generated distribution archives:
|
||||||
|
|
||||||
|
```sh
|
||||||
|
python -m twine upload dist/*
|
||||||
|
```
|
||||||
|
|
||||||
|
## TODO
|
||||||
|
- [ ] Add tests
|
||||||
|
- [ ] Include conversion scripts as command line entry points in this package.
|
||||||
|
- Add CI workflow for releasing the package.
|
1
gguf-py/gguf/__init__.py
Normal file
1
gguf-py/gguf/__init__.py
Normal file
|
@ -0,0 +1 @@
|
||||||
|
from .gguf import *
|
0
gguf.py → gguf-py/gguf/gguf.py
Executable file → Normal file
0
gguf.py → gguf-py/gguf/gguf.py
Executable file → Normal file
28
gguf-py/pyproject.toml
Normal file
28
gguf-py/pyproject.toml
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
[tool.poetry]
|
||||||
|
name = "gguf"
|
||||||
|
version = "0.2.1"
|
||||||
|
description = "Write ML models in GGUF for GGML"
|
||||||
|
authors = ["GGML <ggml@ggml.ai>"]
|
||||||
|
packages = [
|
||||||
|
{include = "gguf"},
|
||||||
|
]
|
||||||
|
readme = "README.md"
|
||||||
|
homepage = "https://ggml.ai"
|
||||||
|
repository = "https://github.com/ggerganov/llama.cpp"
|
||||||
|
keywords = ["ggml", "gguf", "llama.cpp"]
|
||||||
|
classifiers = [
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Operating System :: OS Independent",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.poetry.dependencies]
|
||||||
|
python = ">=3.8"
|
||||||
|
numpy = ">=1.17"
|
||||||
|
|
||||||
|
[tool.poetry.dev-dependencies]
|
||||||
|
pytest = "^5.2"
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["poetry-core>=1.0.0"]
|
||||||
|
build-backend = "poetry.core.masonry.api"
|
7
gguf-py/tests/test_gguf.py
Normal file
7
gguf-py/tests/test_gguf.py
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
import gguf
|
||||||
|
|
||||||
|
# TODO: add tests
|
||||||
|
|
||||||
|
|
||||||
|
def test_write_gguf():
|
||||||
|
pass
|
362
llama.cpp
362
llama.cpp
|
@ -1635,7 +1635,7 @@ static void llm_load_hparams(
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: This should probably be in llama.h
|
// TODO: This should probably be in llama.h
|
||||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape);
|
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos);
|
||||||
|
|
||||||
static void llm_load_vocab(
|
static void llm_load_vocab(
|
||||||
llama_model_loader & ml,
|
llama_model_loader & ml,
|
||||||
|
@ -1737,7 +1737,7 @@ static void llm_load_vocab(
|
||||||
}
|
}
|
||||||
|
|
||||||
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
// determine the newline token: LLaMA "<0x0A>" == 10 == '\n', Falcon 193 == '\n'
|
||||||
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false, false)[0];
|
vocab.linefeed_id = llama_tokenize_internal(vocab, "\n", false)[0];
|
||||||
|
|
||||||
// special tokens
|
// special tokens
|
||||||
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
|
GGUF_GET_KEY(ctx, vocab.special_bos_id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_TOKENIZER_BOS_ID));
|
||||||
|
@ -1836,7 +1836,7 @@ static void llm_load_tensors(
|
||||||
(void) main_gpu;
|
(void) main_gpu;
|
||||||
(void) mul_mat_q;
|
(void) mul_mat_q;
|
||||||
#if defined(GGML_USE_CUBLAS)
|
#if defined(GGML_USE_CUBLAS)
|
||||||
LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
|
LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
|
||||||
ggml_cuda_set_main_device(main_gpu);
|
ggml_cuda_set_main_device(main_gpu);
|
||||||
ggml_cuda_set_mul_mat_q(mul_mat_q);
|
ggml_cuda_set_mul_mat_q(mul_mat_q);
|
||||||
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
|
||||||
|
@ -1958,6 +1958,14 @@ static void llm_load_tensors(
|
||||||
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
||||||
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
||||||
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
||||||
|
|
||||||
|
if (backend_norm == GGML_BACKEND_GPU) {
|
||||||
|
vram_weights += ggml_nbytes(model.output_norm);
|
||||||
|
vram_weights += ggml_nbytes(model.output_norm_b);
|
||||||
|
}
|
||||||
|
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
|
||||||
|
vram_weights += ggml_nbytes(model.output);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const uint32_t n_ff = hparams.n_ff;
|
const uint32_t n_ff = hparams.n_ff;
|
||||||
|
@ -1967,7 +1975,7 @@ static void llm_load_tensors(
|
||||||
model.layers.resize(n_layer);
|
model.layers.resize(n_layer);
|
||||||
|
|
||||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||||
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
|
||||||
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
|
||||||
|
|
||||||
auto & layer = model.layers[i];
|
auto & layer = model.layers[i];
|
||||||
|
@ -1978,6 +1986,11 @@ static void llm_load_tensors(
|
||||||
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
|
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
|
||||||
layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
|
layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
|
||||||
layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
|
layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
|
||||||
|
|
||||||
|
if (backend == GGML_BACKEND_GPU) {
|
||||||
|
vram_weights += ggml_nbytes(layer.attn_norm_2);
|
||||||
|
vram_weights += ggml_nbytes(layer.attn_norm_2_b);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
||||||
|
@ -1985,6 +1998,13 @@ static void llm_load_tensors(
|
||||||
|
|
||||||
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
|
||||||
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
|
||||||
|
|
||||||
|
if (backend == GGML_BACKEND_GPU) {
|
||||||
|
vram_weights +=
|
||||||
|
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
|
||||||
|
ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.wo) +
|
||||||
|
ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
|
@ -3007,14 +3027,8 @@ static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static std::string llama_escape_whitespace(const std::string& text) {
|
static std::string llama_escape_whitespace(const std::string& text) {
|
||||||
std::string result = "\xe2\x96\x81";
|
std::string result = text;
|
||||||
for (size_t offs = 0; offs < text.length(); ++offs) {
|
replace_all(result, " ", "\xe2\x96\x81");
|
||||||
if (text[offs] == ' ') {
|
|
||||||
result += "\xe2\x96\x81";
|
|
||||||
} else {
|
|
||||||
result += text[offs];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3199,7 +3213,7 @@ struct llm_bigram_bpe {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llm_tokenizer_bpe {
|
struct llm_tokenizer_bpe {
|
||||||
llm_tokenizer_bpe(const llama_vocab & vocab, bool g2ws): vocab(vocab) { flag_g2ws = g2ws; }
|
llm_tokenizer_bpe(const llama_vocab & vocab): vocab(vocab) {}
|
||||||
|
|
||||||
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||||
int final_prev_index = -1;
|
int final_prev_index = -1;
|
||||||
|
@ -3351,8 +3365,6 @@ private:
|
||||||
return words;
|
return words;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool flag_g2ws = false;
|
|
||||||
|
|
||||||
const llama_vocab & vocab;
|
const llama_vocab & vocab;
|
||||||
|
|
||||||
std::vector<llm_symbol> symbols;
|
std::vector<llm_symbol> symbols;
|
||||||
|
@ -3361,39 +3373,26 @@ private:
|
||||||
llm_bigram_bpe::queue work_queue;
|
llm_bigram_bpe::queue work_queue;
|
||||||
};
|
};
|
||||||
|
|
||||||
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos, bool escape) {
|
static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab & vocab, const std::string & raw_text, bool bos) {
|
||||||
std::vector<llama_vocab::id> output;
|
std::vector<llama_vocab::id> output;
|
||||||
|
|
||||||
if (raw_text.empty()) {
|
if (raw_text.empty()) {
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (bos && vocab.special_bos_id != -1) {
|
||||||
|
output.push_back(vocab.special_bos_id);
|
||||||
|
}
|
||||||
|
|
||||||
switch (vocab.type) {
|
switch (vocab.type) {
|
||||||
case LLAMA_VOCAB_TYPE_SPM:
|
case LLAMA_VOCAB_TYPE_SPM:
|
||||||
{
|
{
|
||||||
llm_tokenizer_spm tokenizer(vocab);
|
llm_tokenizer_spm tokenizer(vocab);
|
||||||
|
tokenizer.tokenize(llama_escape_whitespace(raw_text), output);
|
||||||
if (bos) {
|
|
||||||
output.push_back(vocab.special_bos_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string text;
|
|
||||||
if (escape) {
|
|
||||||
text = llama_escape_whitespace(raw_text);
|
|
||||||
} else {
|
|
||||||
text = raw_text;
|
|
||||||
}
|
|
||||||
|
|
||||||
tokenizer.tokenize(text, output);
|
|
||||||
} break;
|
} break;
|
||||||
case LLAMA_VOCAB_TYPE_BPE:
|
case LLAMA_VOCAB_TYPE_BPE:
|
||||||
{
|
{
|
||||||
llm_tokenizer_bpe tokenizer(vocab, escape);
|
llm_tokenizer_bpe tokenizer(vocab);
|
||||||
|
|
||||||
if (bos && vocab.special_bos_id != -1) {
|
|
||||||
output.push_back(vocab.special_bos_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
tokenizer.tokenize(raw_text, output);
|
tokenizer.tokenize(raw_text, output);
|
||||||
} break;
|
} break;
|
||||||
};
|
};
|
||||||
|
@ -4306,6 +4305,257 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
|
||||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Beam search
|
||||||
|
//
|
||||||
|
|
||||||
|
struct llama_beam {
|
||||||
|
std::vector<llama_token> tokens;
|
||||||
|
float p; // Cumulative beam probability (renormalized relative to all beams)
|
||||||
|
bool eob; // Initialize end-of-beam to false. Callback sets this to true.
|
||||||
|
// Sort beams by probability. In case of ties, prefer beams at eob.
|
||||||
|
bool operator<(const llama_beam & rhs) const {
|
||||||
|
return std::make_pair(p, eob) < std::make_pair(rhs.p, rhs.eob);
|
||||||
|
}
|
||||||
|
// Shift off first n tokens and discard them.
|
||||||
|
void shift_tokens(const size_t n) {
|
||||||
|
if (n) {
|
||||||
|
std::copy(tokens.begin() + n, tokens.end(), tokens.begin());
|
||||||
|
tokens.resize(tokens.size() - n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
llama_beam_view view() const { return {tokens.data(), tokens.size(), p, eob}; }
|
||||||
|
};
|
||||||
|
|
||||||
|
// A struct for calculating logit-related info.
|
||||||
|
struct llama_logit_info {
|
||||||
|
const float * const logits;
|
||||||
|
const int n_vocab;
|
||||||
|
const float max_l;
|
||||||
|
const float normalizer;
|
||||||
|
struct sum_exp {
|
||||||
|
float max_l;
|
||||||
|
float operator()(float sum, float l) const { return sum + std::exp(l - max_l); }
|
||||||
|
};
|
||||||
|
llama_logit_info(llama_context * ctx)
|
||||||
|
: logits(llama_get_logits(ctx))
|
||||||
|
, n_vocab(llama_n_vocab(ctx))
|
||||||
|
, max_l(*std::max_element(logits, logits + n_vocab))
|
||||||
|
, normalizer(1.0f / std::accumulate(logits, logits + n_vocab, 0.0f, sum_exp{max_l}))
|
||||||
|
{ }
|
||||||
|
llama_token_data get_token_data(const llama_token token_id) const {
|
||||||
|
constexpr auto p = std::numeric_limits<float>::quiet_NaN(); // never used
|
||||||
|
return {token_id, logits[token_id], p};
|
||||||
|
}
|
||||||
|
// Return top k token_data by logit.
|
||||||
|
std::vector<llama_token_data> top_k(size_t k) {
|
||||||
|
std::vector<llama_token_data> min_heap; // min-heap by logit
|
||||||
|
const llama_token k_min = std::min(static_cast<llama_token>(k), n_vocab);
|
||||||
|
min_heap.reserve(k_min);
|
||||||
|
for (llama_token token_id = 0 ; token_id < k_min ; ++token_id) {
|
||||||
|
min_heap.push_back(get_token_data(token_id));
|
||||||
|
}
|
||||||
|
auto comp = [](const llama_token_data & a, const llama_token_data & b) { return a.logit > b.logit; };
|
||||||
|
std::make_heap(min_heap.begin(), min_heap.end(), comp);
|
||||||
|
for (llama_token token_id = k_min ; token_id < n_vocab ; ++token_id) {
|
||||||
|
if (min_heap.front().logit < logits[token_id]) {
|
||||||
|
std::pop_heap(min_heap.begin(), min_heap.end(), comp);
|
||||||
|
min_heap.back().id = token_id;
|
||||||
|
min_heap.back().logit = logits[token_id];
|
||||||
|
std::push_heap(min_heap.begin(), min_heap.end(), comp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return min_heap;
|
||||||
|
}
|
||||||
|
float probability_from_logit(float logit) {
|
||||||
|
return normalizer * std::exp(logit - max_l);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_beam_search_data {
|
||||||
|
llama_context * ctx;
|
||||||
|
size_t n_beams;
|
||||||
|
int n_past;
|
||||||
|
int n_predict;
|
||||||
|
int n_threads;
|
||||||
|
std::vector<llama_beam> beams;
|
||||||
|
std::vector<llama_beam> next_beams;
|
||||||
|
|
||||||
|
// Re-calculated on each loop iteration
|
||||||
|
size_t common_prefix_length;
|
||||||
|
|
||||||
|
// Used to communicate to/from callback on beams state.
|
||||||
|
std::vector<llama_beam_view> beam_views;
|
||||||
|
|
||||||
|
llama_beam_search_data(llama_context * ctx, size_t n_beams, int n_past, int n_predict, int n_threads)
|
||||||
|
: ctx(ctx)
|
||||||
|
, n_beams(n_beams)
|
||||||
|
, n_past(n_past)
|
||||||
|
, n_predict(n_predict)
|
||||||
|
, n_threads(n_threads)
|
||||||
|
, beam_views(n_beams) {
|
||||||
|
beams.reserve(n_beams);
|
||||||
|
next_beams.reserve(n_beams);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Collapse beams to a single beam given by index.
|
||||||
|
void collapse_beams(const size_t beam_idx) {
|
||||||
|
if (0u < beam_idx) {
|
||||||
|
std::swap(beams[0], beams[beam_idx]);
|
||||||
|
}
|
||||||
|
beams.resize(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Min-heaps are used to efficiently collect the top-k elements (k=n_beams).
|
||||||
|
// The repetative patterns below reflect the 2 stages of heaps:
|
||||||
|
// * Gather elements until the vector is full, then call std::make_heap() on it.
|
||||||
|
// * If the heap is full and a new element is found that should be included, pop the
|
||||||
|
// least element to the back(), replace it with the new, then push it into the heap.
|
||||||
|
void fill_next_beams_by_top_probabilities(llama_beam & beam) {
|
||||||
|
// Min-heaps use a greater-than comparator.
|
||||||
|
const auto comp = [](const llama_beam & a, const llama_beam & b) { return a.p > b.p; };
|
||||||
|
if (beam.eob) {
|
||||||
|
// beam is at end-of-sentence, so just copy it to next_beams if its probability is high enough.
|
||||||
|
if (next_beams.size() < n_beams) {
|
||||||
|
next_beams.push_back(std::move(beam));
|
||||||
|
if (next_beams.size() == n_beams) {
|
||||||
|
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
}
|
||||||
|
} else if (next_beams.front().p < beam.p) {
|
||||||
|
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
next_beams.back() = std::move(beam);
|
||||||
|
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// beam is not at end-of-sentence, so branch with next top_k tokens.
|
||||||
|
if (!beam.tokens.empty()) {
|
||||||
|
llama_eval(ctx, beam.tokens.data(), beam.tokens.size(), n_past, n_threads);
|
||||||
|
}
|
||||||
|
llama_logit_info logit_info(ctx);
|
||||||
|
std::vector<llama_token_data> next_tokens = logit_info.top_k(n_beams);
|
||||||
|
size_t i=0;
|
||||||
|
if (next_beams.size() < n_beams) {
|
||||||
|
for (; next_beams.size() < n_beams ; ++i) {
|
||||||
|
llama_beam next_beam = beam;
|
||||||
|
next_beam.tokens.push_back(next_tokens[i].id);
|
||||||
|
next_beam.p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
||||||
|
next_beams.push_back(std::move(next_beam));
|
||||||
|
}
|
||||||
|
std::make_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
} else {
|
||||||
|
for (; next_beams.front().p == 0.0f ; ++i) {
|
||||||
|
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
next_beams.back() = beam;
|
||||||
|
next_beams.back().tokens.push_back(next_tokens[i].id);
|
||||||
|
next_beams.back().p *= logit_info.probability_from_logit(next_tokens[i].logit);
|
||||||
|
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (; i < n_beams ; ++i) {
|
||||||
|
const float next_p = beam.p * logit_info.probability_from_logit(next_tokens[i].logit);
|
||||||
|
if (next_beams.front().p < next_p) {
|
||||||
|
std::pop_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
next_beams.back() = beam;
|
||||||
|
next_beams.back().tokens.push_back(next_tokens[i].id);
|
||||||
|
next_beams.back().p = next_p;
|
||||||
|
std::push_heap(next_beams.begin(), next_beams.end(), comp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find common_prefix_length based on beams.
|
||||||
|
// Requires beams is not empty.
|
||||||
|
size_t find_common_prefix_length() {
|
||||||
|
size_t common_prefix_length = beams[0].tokens.size();
|
||||||
|
for (size_t i = 1 ; i < beams.size() ; ++i) {
|
||||||
|
common_prefix_length = std::min(common_prefix_length, beams[i].tokens.size());
|
||||||
|
for (size_t j = 0 ; j < common_prefix_length ; ++j) {
|
||||||
|
if (beams[0].tokens[j] != beams[i].tokens[j]) {
|
||||||
|
common_prefix_length = j;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return common_prefix_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Construct beams_state to send back to caller via the callback function.
|
||||||
|
// Side effect: set common_prefix_length = find_common_prefix_length();
|
||||||
|
llama_beams_state get_beams_state(const bool last_call) {
|
||||||
|
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
||||||
|
beam_views[i] = beams[i].view();
|
||||||
|
}
|
||||||
|
common_prefix_length = find_common_prefix_length();
|
||||||
|
return {beam_views.data(), beams.size(), common_prefix_length, last_call};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Loop:
|
||||||
|
// * while i < n_predict, AND
|
||||||
|
// * any of the beams have not yet reached end-of-beam (eob), AND
|
||||||
|
// * the highest probability beam(s) (plural in case of ties) are not at end-of-sentence
|
||||||
|
// (since all other beam probabilities can only decrease)
|
||||||
|
void loop(const llama_beam_search_callback_fn_t callback, void * const callback_data) {
|
||||||
|
beams.push_back({{}, 1.0f, false}); // Start with one empty beam w/ probability = 1.0 and !eob.
|
||||||
|
const auto not_eob = [](const llama_beam & beam) { return !beam.eob; };
|
||||||
|
for (int i = 0 ; i < n_predict && std::any_of(beams.begin(),beams.end(),not_eob) &&
|
||||||
|
!beams[top_beam_index()].eob ; ++i) {
|
||||||
|
callback(callback_data, get_beams_state(false)); // Sets common_prefix_length
|
||||||
|
update_beams_from_beam_views(); // Update values (p,eob) that callback may have changed.
|
||||||
|
if (common_prefix_length) {
|
||||||
|
llama_eval(ctx, beams[0].tokens.data(), common_prefix_length, n_past, n_threads);
|
||||||
|
n_past += common_prefix_length;
|
||||||
|
}
|
||||||
|
// Zero-out next_beam probabilities to place them last in following min-heap.
|
||||||
|
std::for_each(next_beams.begin(), next_beams.end(), [](llama_beam & beam) { beam.p = 0.0f; });
|
||||||
|
for (llama_beam & beam : beams) {
|
||||||
|
beam.shift_tokens(common_prefix_length);
|
||||||
|
fill_next_beams_by_top_probabilities(beam);
|
||||||
|
}
|
||||||
|
// next_beams become the beams of next/final iteration. Swap them to re-use memory.
|
||||||
|
beams.swap(next_beams);
|
||||||
|
renormalize_beam_probabilities(beams);
|
||||||
|
}
|
||||||
|
collapse_beams(top_beam_index());
|
||||||
|
callback(callback_data, get_beams_state(true));
|
||||||
|
}
|
||||||
|
|
||||||
|
// As beams grow, the cumulative probabilities decrease.
|
||||||
|
// Renormalize them to avoid floating point underflow.
|
||||||
|
static void renormalize_beam_probabilities(std::vector<llama_beam> & beams) {
|
||||||
|
const auto sum_p = [](float sum, llama_beam & beam) { return sum + beam.p; };
|
||||||
|
const float inv_sum = 1.0f / std::accumulate(beams.begin(), beams.end(), 0.0f, sum_p);
|
||||||
|
std::for_each(beams.begin(), beams.end(), [=](llama_beam & beam) { beam.p *= inv_sum; });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Assumes beams is non-empty. Uses llama_beam::operator<() for ordering.
|
||||||
|
size_t top_beam_index() {
|
||||||
|
return std::max_element(beams.begin(), beams.end()) - beams.begin();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copy (p,eob) for each beam which may have been changed by the callback.
|
||||||
|
void update_beams_from_beam_views() {
|
||||||
|
for (size_t i = 0 ; i < beams.size() ; ++i) {
|
||||||
|
beams[i].p = beam_views[i].p;
|
||||||
|
beams[i].eob = beam_views[i].eob;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
void llama_beam_search(llama_context * ctx,
|
||||||
|
llama_beam_search_callback_fn_t callback, void * callback_data,
|
||||||
|
size_t n_beams, int n_past, int n_predict, int n_threads) {
|
||||||
|
assert(ctx);
|
||||||
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
|
llama_beam_search_data beam_search_data(ctx, n_beams, n_past, n_predict, n_threads);
|
||||||
|
|
||||||
|
beam_search_data.loop(callback, callback_data);
|
||||||
|
|
||||||
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
|
ctx->n_sample++;
|
||||||
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// quantization
|
// quantization
|
||||||
//
|
//
|
||||||
|
@ -4403,6 +4653,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
|
|
||||||
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
|
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname_inp, /*use_mmap*/ false));
|
||||||
|
|
||||||
|
llama_model model;
|
||||||
|
llm_load_arch(*ml, model);
|
||||||
|
llm_load_hparams(*ml, model, 0, 0, 0);
|
||||||
|
|
||||||
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
||||||
struct gguf_context * ctx_out = gguf_init_empty();
|
struct gguf_context * ctx_out = gguf_init_empty();
|
||||||
|
|
||||||
|
@ -4428,6 +4682,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
++n_feed_forward_w2;
|
++n_feed_forward_w2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (n_attention_wv != n_feed_forward_w2 || (uint32_t)n_attention_wv != model.hparams.n_layer) {
|
||||||
|
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_feed_forward_w2 = %d, hparams.n_layer = %d\n",
|
||||||
|
__func__, n_attention_wv, n_feed_forward_w2, model.hparams.n_layer);
|
||||||
|
}
|
||||||
|
|
||||||
int i_attention_wv = 0;
|
int i_attention_wv = 0;
|
||||||
int i_feed_forward_w2 = 0;
|
int i_feed_forward_w2 = 0;
|
||||||
|
@ -4504,8 +4762,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
|
|
||||||
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
|
||||||
int nx = tensor->ne[0];
|
int nx = tensor->ne[0];
|
||||||
int ny = tensor->ne[1];
|
if (nx % QK_K == 0) {
|
||||||
if (nx % QK_K == 0 && ny % QK_K == 0) {
|
|
||||||
new_type = GGML_TYPE_Q6_K;
|
new_type = GGML_TYPE_Q6_K;
|
||||||
}
|
}
|
||||||
} else if (name.find("attn_v.weight") != std::string::npos) {
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
||||||
|
@ -4519,6 +4776,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
|
||||||
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
|
||||||
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
|
||||||
|
if (model.type == MODEL_70B) {
|
||||||
|
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
|
||||||
|
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
|
||||||
|
// nearly negligible increase in model size by quantizing this tensor with more bits:
|
||||||
|
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
|
||||||
|
}
|
||||||
++i_attention_wv;
|
++i_attention_wv;
|
||||||
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
} else if (name.find("ffn_down.weight") != std::string::npos) {
|
||||||
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
||||||
|
@ -4548,8 +4811,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
|
||||||
int nx = tensor->ne[0];
|
int nx = tensor->ne[0];
|
||||||
int ny = tensor->ne[1];
|
int ny = tensor->ne[1];
|
||||||
if (nx % QK_K != 0 || ny % QK_K != 0) {
|
if (nx % QK_K != 0) {
|
||||||
LLAMA_LOG_INFO("\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
|
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
|
||||||
convert_incompatible_tensor = true;
|
convert_incompatible_tensor = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5277,13 +5540,29 @@ int llama_model_n_embd(const struct llama_model * model) {
|
||||||
return model->hparams.n_embd;
|
return model->hparams.n_embd;
|
||||||
}
|
}
|
||||||
|
|
||||||
int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
|
int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
||||||
return snprintf(buf, buf_size, "%s %s %s",
|
return snprintf(buf, buf_size, "%s %s %s",
|
||||||
model->name.c_str(),
|
model->name.c_str(),
|
||||||
llama_model_type_name(model->type),
|
llama_model_type_name(model->type),
|
||||||
llama_model_ftype_name(model->ftype).c_str());
|
llama_model_ftype_name(model->ftype).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t llama_model_size(const struct llama_model * model) {
|
||||||
|
uint64_t size = 0;
|
||||||
|
for (const auto & it : model->tensors_by_name) {
|
||||||
|
size += ggml_nbytes(it.second);
|
||||||
|
}
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t llama_model_n_params(const struct llama_model * model) {
|
||||||
|
uint64_t nparams = 0;
|
||||||
|
for (const auto & it : model->tensors_by_name) {
|
||||||
|
nparams += ggml_nelements(it.second);
|
||||||
|
}
|
||||||
|
return nparams;
|
||||||
|
}
|
||||||
|
|
||||||
int llama_model_quantize(
|
int llama_model_quantize(
|
||||||
const char * fname_inp,
|
const char * fname_inp,
|
||||||
const char * fname_out,
|
const char * fname_out,
|
||||||
|
@ -5808,8 +6087,7 @@ int llama_tokenize_with_model(
|
||||||
llama_token * tokens,
|
llama_token * tokens,
|
||||||
int n_max_tokens,
|
int n_max_tokens,
|
||||||
bool add_bos) {
|
bool add_bos) {
|
||||||
auto escape = llama_vocab_get_type(model->vocab) == LLAMA_VOCAB_TYPE_SPM;
|
auto res = llama_tokenize_internal(model->vocab, text, add_bos);
|
||||||
auto res = llama_tokenize_internal(model->vocab, text, add_bos, escape);
|
|
||||||
|
|
||||||
if (n_max_tokens < (int) res.size()) {
|
if (n_max_tokens < (int) res.size()) {
|
||||||
LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
|
||||||
|
|
43
llama.h
43
llama.h
|
@ -254,7 +254,11 @@ extern "C" {
|
||||||
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
|
LLAMA_API int llama_model_n_embd (const struct llama_model * model);
|
||||||
|
|
||||||
// Get a string describing the model type
|
// Get a string describing the model type
|
||||||
LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
|
LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
|
||||||
|
// Returns the total size of all the tensors in the model in bytes
|
||||||
|
LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
|
||||||
|
// Returns the total number of parameters in the model
|
||||||
|
LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);
|
||||||
|
|
||||||
// Returns 0 on success
|
// Returns 0 on success
|
||||||
LLAMA_API int llama_model_quantize(
|
LLAMA_API int llama_model_quantize(
|
||||||
|
@ -465,6 +469,43 @@ extern "C" {
|
||||||
/// @details Accepts the sampled token into the grammar
|
/// @details Accepts the sampled token into the grammar
|
||||||
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
|
LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Beam search
|
||||||
|
//
|
||||||
|
|
||||||
|
struct llama_beam_view {
|
||||||
|
const llama_token * tokens;
|
||||||
|
size_t n_tokens;
|
||||||
|
float p; // Cumulative beam probability (renormalized relative to all beams)
|
||||||
|
bool eob; // Callback should set this to true when a beam is at end-of-beam.
|
||||||
|
};
|
||||||
|
|
||||||
|
// Passed to beam_search_callback function.
|
||||||
|
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
|
||||||
|
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
|
||||||
|
// These pointers are valid only during the synchronous callback, so should not be saved.
|
||||||
|
struct llama_beams_state {
|
||||||
|
struct llama_beam_view * beam_views;
|
||||||
|
size_t n_beams; // Number of elements in beam_views[].
|
||||||
|
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
|
||||||
|
bool last_call; // True iff this is the last callback invocation.
|
||||||
|
};
|
||||||
|
|
||||||
|
// Type of pointer to the beam_search_callback function.
|
||||||
|
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
|
||||||
|
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
|
||||||
|
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, llama_beams_state);
|
||||||
|
|
||||||
|
/// @details Deterministically returns entire sentence constructed by a beam search.
|
||||||
|
/// @param ctx Pointer to the llama_context.
|
||||||
|
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
|
||||||
|
/// @param callback_data A pointer that is simply passed back to callback.
|
||||||
|
/// @param n_beams Number of beams to use.
|
||||||
|
/// @param n_past Number of tokens already evaluated.
|
||||||
|
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
|
||||||
|
/// @param n_threads Number of threads as passed to llama_eval().
|
||||||
|
LLAMA_API void llama_beam_search(struct llama_context * ctx, llama_beam_search_callback_fn_t callback, void * callback_data, size_t n_beams, int n_past, int n_predict, int n_threads);
|
||||||
|
|
||||||
// Performance information
|
// Performance information
|
||||||
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
||||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||||
|
|
|
@ -1,2 +1,3 @@
|
||||||
numpy==1.24
|
numpy==1.24
|
||||||
sentencepiece==0.1.98
|
sentencepiece==0.1.98
|
||||||
|
gguf>=0.1.0
|
||||||
|
|
|
@ -100,7 +100,8 @@ int main(int argc, char **argv) {
|
||||||
bool success = true;
|
bool success = true;
|
||||||
|
|
||||||
for (const auto & test_kv : k_tests()) {
|
for (const auto & test_kv : k_tests()) {
|
||||||
std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
|
// Add a space in front of the first character to match OG llama tokenizer behavior
|
||||||
|
std::vector<llama_token> res = llama_tokenize(ctx, " " + test_kv.first, true);
|
||||||
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
|
fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
|
||||||
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
|
__func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue