Merge branch 'master' into convert_rope_scale
commit 4950b2dbbc
26 changed files with 2579 additions and 1275 deletions
44  .devops/full-rocm.Dockerfile  Normal file
@@ -0,0 +1,44 @@
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=5.6
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
ARG ROCM_DOCKER_ARCH=\
|
||||
gfx803 \
|
||||
gfx900 \
|
||||
gfx906 \
|
||||
gfx908 \
|
||||
gfx90a \
|
||||
gfx1010 \
|
||||
gfx1030 \
|
||||
gfx1100 \
|
||||
gfx1101 \
|
||||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
# Enable ROCm
|
||||
ENV LLAMA_HIPBLAS=1
|
||||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
RUN make
|
||||
|
||||
ENTRYPOINT ["/app/.devops/tools.sh"]
|
44  .devops/main-rocm.Dockerfile  Normal file
@@ -0,0 +1,44 @@
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=5.6
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
ARG ROCM_DOCKER_ARCH=\
|
||||
gfx803 \
|
||||
gfx900 \
|
||||
gfx906 \
|
||||
gfx908 \
|
||||
gfx90a \
|
||||
gfx1010 \
|
||||
gfx1030 \
|
||||
gfx1100 \
|
||||
gfx1101 \
|
||||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
# Enable ROCm
|
||||
ENV LLAMA_HIPBLAS=1
|
||||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
RUN make
|
||||
|
||||
ENTRYPOINT [ "/app/main" ]
|
|
@ -5,14 +5,7 @@
|
|||
.vscode/
|
||||
.DS_Store
|
||||
|
||||
build/
|
||||
build-em/
|
||||
build-debug/
|
||||
build-release/
|
||||
build-static/
|
||||
build-no-accel/
|
||||
build-sanitize-addr/
|
||||
build-sanitize-thread/
|
||||
build*/
|
||||
|
||||
models/*
|
||||
|
||||
|
|
17  .gitignore  vendored
@@ -16,20 +16,7 @@
.vs/
|
||||
.vscode/
|
||||
|
||||
build/
|
||||
build-em/
|
||||
build-debug/
|
||||
build-release/
|
||||
build-ci-debug/
|
||||
build-ci-release/
|
||||
build-static/
|
||||
build-cublas/
|
||||
build-opencl/
|
||||
build-metal/
|
||||
build-mpi/
|
||||
build-no-accel/
|
||||
build-sanitize-addr/
|
||||
build-sanitize-thread/
|
||||
build*/
|
||||
out/
|
||||
tmp/
|
||||
|
||||
|
@ -60,6 +47,7 @@ compile_commands.json
|
|||
CMakeSettings.json
|
||||
|
||||
__pycache__
|
||||
dist
|
||||
|
||||
zig-out/
|
||||
zig-cache/
|
||||
|
@ -70,7 +58,6 @@ perf-*.txt
|
|||
|
||||
examples/jeopardy/results.txt
|
||||
|
||||
pyproject.toml
|
||||
poetry.lock
|
||||
poetry.toml
|
||||
|
||||
|
|
|
@ -74,6 +74,7 @@ set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kern
|
|||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
|
@ -352,6 +353,43 @@ if (LLAMA_CLBLAST)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_HIPBLAS)
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
|
||||
|
||||
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
|
||||
endif()
|
||||
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
|
||||
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
|
||||
endif()
|
||||
|
||||
find_package(hip)
|
||||
find_package(hipblas)
|
||||
find_package(rocblas)
|
||||
|
||||
if (${hipblas_FOUND} AND ${hip_FOUND})
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
|
||||
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
|
||||
if (LLAMA_CUDA_FORCE_DMMV)
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
|
||||
endif()
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
|
||||
target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
|
||||
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
|
||||
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
|
||||
if (LLAMA_STATIC)
|
||||
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
|
||||
endif()
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ggml-rocm)
|
||||
else()
|
||||
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_ALL_WARNINGS)
|
||||
if (NOT MSVC)
|
||||
set(c_flags
|
||||
|
|
24  Makefile
@@ -280,6 +280,30 @@ ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # LLAMA_CLBLAST
|
||||
|
||||
ifdef LLAMA_HIPBLAS
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
HIPCC ?= $(ROCM_PATH)/bin/hipcc
|
||||
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
||||
LLAMA_CUDA_DMMV_X ?= 32
|
||||
LLAMA_CUDA_MMV_Y ?= 1
|
||||
LLAMA_CUDA_KQUANTS_ITER ?= 2
|
||||
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
||||
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
||||
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
||||
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
||||
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
|
||||
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
||||
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
|
||||
HIPFLAGS += -DCC_TURING=1000000000
|
||||
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||
endif # LLAMA_CUDA_FORCE_DMMV
|
||||
OBJS += ggml-cuda.o
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
||||
endif # LLAMA_HIPBLAS
|
||||
|
||||
ifdef LLAMA_METAL
|
||||
CFLAGS += -DGGML_USE_METAL -DGGML_METAL_NDEBUG
|
||||
CXXFLAGS += -DGGML_USE_METAL
|
||||
|
|
29  README.md
@@ -422,6 +422,35 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
|
||||
- #### hipBLAS
|
||||
|
||||
This provides BLAS acceleration on HIP-supported GPUs, such as AMD GPUs.
|
||||
Make sure to have ROCm installed.
|
||||
You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html).
|
||||
Windows support is coming soon...
|
||||
|
||||
- Using `make`:
|
||||
```bash
|
||||
make LLAMA_HIPBLAS=1
|
||||
```
|
||||
- Using `CMake`:
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DLLAMA_HIPBLAS=ON
|
||||
cmake --build .
|
||||
```
|
||||
|
||||
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
|
||||
If your GPU is not officially supported, you can set the environment variable [`HSA_OVERRIDE_GFX_VERSION`] to a similar GPU, for example 10.3.0 on RDNA2 or 11.0.0 on RDNA3.
|
||||
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
|
||||
|
||||
| Option | Legal values | Default | Description |
|
||||
|-------------------------|------------------------|---------|-------------|
|
||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||
|
||||
- #### CLBlast
|
||||
|
||||
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
|
||||
|
|
|
@ -391,6 +391,7 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|||
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
||||
|
||||
python3 -m pip install -r ${SRC}/requirements.txt
|
||||
python3 -m pip install --editable gguf-py
|
||||
fi
|
||||
|
||||
ret=0
|
||||
|
|
|
@ -613,9 +613,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
|
||||
fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
|
||||
fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n");
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
fprintf(stdout, " -nommq, --no-mul-mat-q\n");
|
||||
fprintf(stdout, " use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
|
||||
fprintf(stdout, " use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
|
||||
fprintf(stdout, " Not recommended since this is both slower and uses more VRAM.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#endif
|
||||
fprintf(stdout, " --mtest compute maximum memory usage\n");
|
||||
fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n");
|
||||
|
|
36  convert.py
@@ -161,13 +161,14 @@ class Params:
def loadHFTransformerJson(model: 'LazyModel', config_path: 'Path') -> 'Params':
|
||||
config = json.load(open(config_path))
|
||||
|
||||
n_vocab = config["vocab_size"]
|
||||
n_embd = config["hidden_size"]
|
||||
n_layer = config["num_hidden_layers"]
|
||||
n_ff = config["intermediate_size"]
|
||||
n_head = config["num_attention_heads"]
|
||||
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
|
||||
f_norm_eps = config["rms_norm_eps"]
|
||||
n_vocab = config["vocab_size"]
|
||||
n_embd = config["hidden_size"]
|
||||
n_layer = config["num_hidden_layers"]
|
||||
n_ff = config["intermediate_size"]
|
||||
n_head = config["num_attention_heads"]
|
||||
n_head_kv = config["num_key_value_heads"] if "num_key_value_heads" in config else n_head
|
||||
f_norm_eps = config["rms_norm_eps"]
|
||||
f_rope_freq_base = config["rope_theta"] if "rope_theta" in config else None
|
||||
|
||||
if "rope_scaling" in config and config["rope_scaling"].get("type") == "linear":
|
||||
f_rope_scale = config["rope_scaling"].get("factor")
|
||||
|
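As a compact illustration of the fields added in this hunk (not part of the patch, and the helper name is invented for this note), the sketch below shows how `f_rope_freq_base` and `f_rope_scale` would be derived from a Hugging Face `config.json`; the key names mirror the code above.

```python
import json
from pathlib import Path

def read_rope_params(config_path: Path):
    # mirrors the logic in loadHFTransformerJson above; illustration only
    config = json.load(open(config_path))
    # optional RoPE frequency base (rope_theta); None when the config omits it
    f_rope_freq_base = config.get("rope_theta")
    # only linear rope_scaling is converted to a scale factor
    f_rope_scale = None
    rope_scaling = config.get("rope_scaling")
    if rope_scaling is not None and rope_scaling.get("type") == "linear":
        f_rope_scale = rope_scaling.get("factor")
    return f_rope_freq_base, f_rope_scale
```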
@ -185,16 +186,17 @@ class Params:
|
|||
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||
|
||||
return Params(
|
||||
n_vocab = n_vocab,
|
||||
n_embd = n_embd,
|
||||
n_mult = n_mult,
|
||||
n_layer = n_layer,
|
||||
n_ctx = n_ctx,
|
||||
n_ff = n_ff,
|
||||
n_head = n_head,
|
||||
n_head_kv = n_head_kv,
|
||||
f_norm_eps = f_norm_eps,
|
||||
f_rope_scale = f_rope_scale,
|
||||
n_vocab = n_vocab,
|
||||
n_embd = n_embd,
|
||||
n_mult = n_mult,
|
||||
n_layer = n_layer,
|
||||
n_ctx = n_ctx,
|
||||
n_ff = n_ff,
|
||||
n_head = n_head,
|
||||
n_head_kv = n_head_kv,
|
||||
f_norm_eps = f_norm_eps,
|
||||
f_rope_freq_base = f_rope_freq_base,
|
||||
f_rope_scale = f_rope_scale,
|
||||
)
|
||||
|
||||
# LLaMA v2 70B params.json
|
||||
|
|
|
@ -18,9 +18,7 @@
|
|||
#include "llama.h"
|
||||
#include "common.h"
|
||||
#include "build-info.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
// utils
|
||||
static uint64_t get_time_ns() {
|
||||
|
@ -443,6 +441,8 @@ struct test {
|
|||
static const std::string gpu_info;
|
||||
std::string model_filename;
|
||||
std::string model_type;
|
||||
uint64_t model_size;
|
||||
uint64_t model_n_params;
|
||||
int n_batch;
|
||||
int n_threads;
|
||||
bool f32_kv;
|
||||
|
@ -459,8 +459,10 @@ struct test {
|
|||
test(const cmd_params_instance & inst, const llama_model * lmodel, const llama_context * ctx) {
|
||||
model_filename = inst.model;
|
||||
char buf[128];
|
||||
llama_model_type(lmodel, buf, sizeof(buf));
|
||||
llama_model_desc(lmodel, buf, sizeof(buf));
|
||||
model_type = buf;
|
||||
model_size = llama_model_size(lmodel);
|
||||
model_n_params = llama_model_n_params(lmodel);
|
||||
n_batch = inst.n_batch;
|
||||
n_threads = inst.n_threads;
|
||||
f32_kv = inst.f32_kv;
|
||||
|
@ -504,7 +506,7 @@ struct test {
|
|||
|
||||
static std::string get_backend() {
|
||||
if (cuda) {
|
||||
return "CUDA";
|
||||
return GGML_CUDA_NAME;
|
||||
}
|
||||
if (opencl) {
|
||||
return "OpenCL";
|
||||
|
@ -526,7 +528,7 @@ struct test {
|
|||
"build_commit", "build_number",
|
||||
"cuda", "opencl", "metal", "gpu_blas", "blas",
|
||||
"cpu_info", "gpu_info",
|
||||
"model_filename", "model_type",
|
||||
"model_filename", "model_type", "model_size", "model_n_params",
|
||||
"n_batch", "n_threads", "f16_kv",
|
||||
"n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split",
|
||||
"n_prompt", "n_gen", "test_time",
|
||||
|
@ -540,6 +542,7 @@ struct test {
|
|||
|
||||
static field_type get_field_type(const std::string & field) {
|
||||
if (field == "build_number" || field == "n_batch" || field == "n_threads" ||
|
||||
field == "model_size" || field == "model_n_params" ||
|
||||
field == "n_gpu_layers" || field == "main_gpu" ||
|
||||
field == "n_prompt" || field == "n_gen" ||
|
||||
field == "avg_ns" || field == "stddev_ns") {
|
||||
|
@ -575,7 +578,7 @@ struct test {
|
|||
build_commit, std::to_string(build_number),
|
||||
std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas),
|
||||
cpu_info, gpu_info,
|
||||
model_filename, model_type,
|
||||
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
|
||||
std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv),
|
||||
std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str,
|
||||
std::to_string(n_prompt), std::to_string(n_gen), test_time,
|
||||
|
@ -711,8 +714,15 @@ struct markdown_printer : public printer {
|
|||
return -30;
|
||||
}
|
||||
if (field == "t/s") {
|
||||
return 15;
|
||||
return 16;
|
||||
}
|
||||
if (field == "size" || field == "params") {
|
||||
return 10;
|
||||
}
|
||||
if (field == "n_gpu_layers") {
|
||||
return 3;
|
||||
}
|
||||
|
||||
int width = std::max((int)field.length(), 10);
|
||||
|
||||
if (test::get_field_type(field) == test::STRING) {
|
||||
|
@ -721,9 +731,28 @@ struct markdown_printer : public printer {
|
|||
return width;
|
||||
}
|
||||
|
||||
static std::string get_field_display_name(const std::string & field) {
|
||||
if (field == "n_gpu_layers") {
|
||||
return "ngl";
|
||||
}
|
||||
if (field == "n_threads") {
|
||||
return "threads";
|
||||
}
|
||||
if (field == "mul_mat_q") {
|
||||
return "mmq";
|
||||
}
|
||||
if (field == "tensor_split") {
|
||||
return "ts";
|
||||
}
|
||||
return field;
|
||||
}
|
||||
|
||||
void print_header(const cmd_params & params) override {
|
||||
// select fields to print
|
||||
fields = { "model", "backend" };
|
||||
fields.push_back("model");
|
||||
fields.push_back("size");
|
||||
fields.push_back("params");
|
||||
fields.push_back("backend");
|
||||
bool is_cpu_backend = test::get_backend() == "CPU" || test::get_backend() == "BLAS";
|
||||
if (!is_cpu_backend) {
|
||||
fields.push_back("n_gpu_layers");
|
||||
|
@ -754,7 +783,7 @@ struct markdown_printer : public printer {
|
|||
|
||||
fprintf(fout, "|");
|
||||
for (const auto & field : fields) {
|
||||
fprintf(fout, " %*s |", get_field_width(field), field.c_str());
|
||||
fprintf(fout, " %*s |", get_field_width(field), get_field_display_name(field).c_str());
|
||||
}
|
||||
fprintf(fout, "\n");
|
||||
fprintf(fout, "|");
|
||||
|
@ -771,12 +800,26 @@ struct markdown_printer : public printer {
|
|||
fprintf(fout, "|");
|
||||
for (const auto & field : fields) {
|
||||
std::string value;
|
||||
char buf[128];
|
||||
if (field == "model") {
|
||||
value = t.model_type;
|
||||
} else if (field == "size") {
|
||||
if (t.model_size < 1024*1024*1024) {
|
||||
snprintf(buf, sizeof(buf), "%.2f MiB", t.model_size / 1024.0 / 1024.0);
|
||||
} else {
|
||||
snprintf(buf, sizeof(buf), "%.2f GiB", t.model_size / 1024.0 / 1024.0 / 1024.0);
|
||||
}
|
||||
value = buf;
|
||||
} else if (field == "params") {
|
||||
if (t.model_n_params < 1000*1000*1000) {
|
||||
snprintf(buf, sizeof(buf), "%.2f M", t.model_n_params / 1e6);
|
||||
} else {
|
||||
snprintf(buf, sizeof(buf), "%.2f B", t.model_n_params / 1e9);
|
||||
}
|
||||
value = buf;
|
||||
} else if (field == "backend") {
|
||||
value = test::get_backend();
|
||||
} else if (field == "test") {
|
||||
char buf[128];
|
||||
if (t.n_prompt > 0 && t.n_gen == 0) {
|
||||
snprintf(buf, sizeof(buf), "pp %d", t.n_prompt);
|
||||
} else if (t.n_gen > 0 && t.n_prompt == 0) {
|
||||
|
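The new `size` and `params` columns are rendered with the thresholds shown above; here is the same formatting restated as a small Python sketch (illustration only, not part of the patch, with invented helper names).

```python
def format_model_size(n_bytes: int) -> str:
    # below 1 GiB print MiB, otherwise GiB, matching the snprintf calls above
    if n_bytes < 1024 ** 3:
        return f"{n_bytes / 1024 ** 2:.2f} MiB"
    return f"{n_bytes / 1024 ** 3:.2f} GiB"

def format_model_params(n_params: int) -> str:
    # below 1e9 parameters print millions, otherwise billions
    if n_params < 1000 ** 3:
        return f"{n_params / 1e6:.2f} M"
    return f"{n_params / 1e9:.2f} B"

assert format_model_params(7_000_000_000) == "7.00 B"
```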
@ -787,7 +830,6 @@ struct markdown_printer : public printer {
|
|||
}
|
||||
value = buf;
|
||||
} else if (field == "t/s") {
|
||||
char buf[128];
|
||||
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
||||
value = buf;
|
||||
} else if (vmap.find(field) != vmap.end()) {
|
||||
|
|
File diff suppressed because it is too large
|
@ -102,6 +102,17 @@
|
|||
padding: 0.5em;
|
||||
}
|
||||
|
||||
.prob-set {
|
||||
padding: 0.3em;
|
||||
border-bottom: 1px solid #ccc;
|
||||
}
|
||||
|
||||
.popover-content {
|
||||
position: absolute;
|
||||
background-color: white;
|
||||
padding: 0.2em;
|
||||
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
|
||||
}
|
||||
|
||||
textarea {
|
||||
padding: 5px;
|
||||
|
@ -133,11 +144,17 @@
|
|||
font-size: 80%;
|
||||
color: #888;
|
||||
}
|
||||
|
||||
@media (prefers-color-scheme: dark) {
|
||||
.popover-content {
|
||||
background-color: black;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
|
||||
<script type="module">
|
||||
import {
|
||||
html, h, signal, effect, computed, render, useSignal, useEffect, useRef
|
||||
html, h, signal, effect, computed, render, useSignal, useEffect, useRef, Component
|
||||
} from '/index.js';
|
||||
|
||||
import { llama } from '/completion.js';
|
||||
|
@ -168,6 +185,7 @@
|
|||
mirostat_tau: 5, // target entropy
|
||||
mirostat_eta: 0.1, // learning rate
|
||||
grammar: '',
|
||||
n_probs: 0, // no completion_probabilities
|
||||
})
|
||||
|
||||
/* START: Support for storing prompt templates and parameters in browser LocalStorage */
|
||||
|
@ -334,10 +352,21 @@
|
|||
|
||||
const prompt = template(session.value.template, {
|
||||
message: msg,
|
||||
history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"),
|
||||
history: session.value.transcript.flatMap(
|
||||
([name, data]) =>
|
||||
template(
|
||||
session.value.historyTemplate,
|
||||
{
|
||||
name,
|
||||
message: Array.isArray(data) ?
|
||||
data.map(msg => msg.content).join('').replace(/^\s/, '') :
|
||||
data,
|
||||
}
|
||||
)
|
||||
).join("\n"),
|
||||
});
|
||||
|
||||
let currentMessage = '';
|
||||
const currentMessages = [];
|
||||
const history = session.value.transcript
|
||||
|
||||
const llamaParams = {
|
||||
|
@ -347,15 +376,19 @@
|
|||
|
||||
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
||||
const data = chunk.data;
|
||||
currentMessage += data.content;
|
||||
|
||||
// remove leading whitespace
|
||||
currentMessage = currentMessage.replace(/^\s+/, "")
|
||||
|
||||
transcriptUpdate([...history, ["{{char}}", currentMessage]])
|
||||
|
||||
if (data.stop) {
|
||||
console.log("Completion finished: '", currentMessage, "', summary: ", data);
|
||||
while (
|
||||
currentMessages.length > 0 &&
|
||||
currentMessages[currentMessages.length - 1].content.match(/\n$/) != null
|
||||
) {
|
||||
currentMessages.pop();
|
||||
}
|
||||
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
||||
console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
|
||||
} else {
|
||||
currentMessages.push(data);
|
||||
transcriptUpdate([...history, ["{{char}}", currentMessages]])
|
||||
}
|
||||
|
||||
if (data.timings) {
|
||||
|
@ -420,8 +453,18 @@
|
|||
}
|
||||
}, [messages])
|
||||
|
||||
const chatLine = ([user, msg]) => {
|
||||
return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdownish} text=${template(msg)} /></p>`
|
||||
const chatLine = ([user, data], index) => {
|
||||
let message
|
||||
const isArrayMessage = Array.isArray(data)
|
||||
if (params.value.n_probs > 0 && isArrayMessage) {
|
||||
message = html`<${Probabilities} data=${data} />`
|
||||
} else {
|
||||
const text = isArrayMessage ?
|
||||
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
|
||||
data;
|
||||
message = html`<${Markdownish} text=${template(text)} />`
|
||||
}
|
||||
return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
|
||||
};
|
||||
|
||||
return html`
|
||||
|
@ -568,10 +611,71 @@
|
|||
${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
|
||||
${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
|
||||
</fieldset>
|
||||
<fieldset>
|
||||
${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
|
||||
</fieldset>
|
||||
</details>
|
||||
</form>
|
||||
`
|
||||
}
|
||||
|
||||
const probColor = (p) => {
|
||||
const r = Math.floor(192 * (1 - p));
|
||||
const g = Math.floor(192 * p);
|
||||
return `rgba(${r},${g},0,0.3)`;
|
||||
}
|
||||
|
||||
const Probabilities = (params) => {
|
||||
return params.data.map(msg => {
|
||||
const { completion_probabilities } = msg;
|
||||
if (
|
||||
!completion_probabilities ||
|
||||
completion_probabilities.length === 0
|
||||
) return msg.content
|
||||
|
||||
if (completion_probabilities.length > 1) {
|
||||
// Not for byte pair
|
||||
if (completion_probabilities[0].content.startsWith('byte: \\')) return msg.content
|
||||
|
||||
const splitData = completion_probabilities.map(prob => ({
|
||||
content: prob.content,
|
||||
completion_probabilities: [prob]
|
||||
}))
|
||||
return html`<${Probabilities} data=${splitData} />`
|
||||
}
|
||||
|
||||
const { probs, content } = completion_probabilities[0]
|
||||
const found = probs.find(p => p.tok_str === msg.content)
|
||||
const pColor = found ? probColor(found.prob) : 'transparent'
|
||||
|
||||
const popoverChildren = html`
|
||||
<div class="prob-set">
|
||||
${probs.map((p, index) => {
|
||||
return html`
|
||||
<div
|
||||
key=${index}
|
||||
title=${`prob: ${p.prob}`}
|
||||
style=${{
|
||||
padding: '0.3em',
|
||||
backgroundColor: p.tok_str === content ? probColor(p.prob) : 'transparent'
|
||||
}}
|
||||
>
|
||||
<span>${p.tok_str}: </span>
|
||||
<span>${Math.floor(p.prob * 100)}%</span>
|
||||
</div>
|
||||
`
|
||||
})}
|
||||
</div>
|
||||
`
|
||||
|
||||
return html`
|
||||
<${Popover} style=${{ backgroundColor: pColor }} popoverChildren=${popoverChildren}>
|
||||
${msg.content.match(/\n/gim) ? html`<br />` : msg.content}
|
||||
</>
|
||||
`
|
||||
});
|
||||
}
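The shading rule used by `probColor` and `Probabilities` above can be restated in isolation: tokens drift from red at probability 0 to green at probability 1, with a fixed 0.3 alpha. A tiny Python sketch of the same mapping (illustration only, not part of the patch; the function name is invented here):

```python
def prob_color(p: float) -> str:
    # same mapping as the JavaScript probColor above
    r = int(192 * (1 - p))
    g = int(192 * p)
    return f"rgba({r},{g},0,0.3)"

assert prob_color(1.0) == "rgba(0,192,0,0.3)"
assert prob_color(0.0) == "rgba(192,0,0,0.3)"
```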
|
||||
|
||||
// poor man's markdown replacement
|
||||
const Markdownish = (params) => {
|
||||
const md = params.text
|
||||
|
@ -600,10 +704,121 @@
|
|||
`
|
||||
}
|
||||
|
||||
// simple popover impl
|
||||
const Popover = (props) => {
|
||||
const isOpen = useSignal(false);
|
||||
const position = useSignal({ top: '0px', left: '0px' });
|
||||
const buttonRef = useRef(null);
|
||||
const popoverRef = useRef(null);
|
||||
|
||||
const togglePopover = () => {
|
||||
if (buttonRef.current) {
|
||||
const rect = buttonRef.current.getBoundingClientRect();
|
||||
position.value = {
|
||||
top: `${rect.bottom + window.scrollY}px`,
|
||||
left: `${rect.left + window.scrollX}px`,
|
||||
};
|
||||
}
|
||||
isOpen.value = !isOpen.value;
|
||||
};
|
||||
|
||||
const handleClickOutside = (event) => {
|
||||
if (popoverRef.current && !popoverRef.current.contains(event.target) && !buttonRef.current.contains(event.target)) {
|
||||
isOpen.value = false;
|
||||
}
|
||||
};
|
||||
|
||||
useEffect(() => {
|
||||
document.addEventListener('mousedown', handleClickOutside);
|
||||
return () => {
|
||||
document.removeEventListener('mousedown', handleClickOutside);
|
||||
};
|
||||
}, []);
|
||||
|
||||
return html`
|
||||
<span style=${props.style} ref=${buttonRef} onClick=${togglePopover}>${props.children}</span>
|
||||
${isOpen.value && html`
|
||||
<${Portal} into="#portal">
|
||||
<div
|
||||
ref=${popoverRef}
|
||||
class="popover-content"
|
||||
style=${{
|
||||
top: position.value.top,
|
||||
left: position.value.left,
|
||||
}}
|
||||
>
|
||||
${props.popoverChildren}
|
||||
</div>
|
||||
</${Portal}>
|
||||
`}
|
||||
`;
|
||||
};
|
||||
|
||||
// Source: preact-portal (https://github.com/developit/preact-portal/blob/master/src/preact-portal.js)
|
||||
/** Redirect rendering of descendants into the given CSS selector */
|
||||
class Portal extends Component {
|
||||
componentDidUpdate(props) {
|
||||
for (let i in props) {
|
||||
if (props[i] !== this.props[i]) {
|
||||
return setTimeout(this.renderLayer);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
componentDidMount() {
|
||||
this.isMounted = true;
|
||||
this.renderLayer = this.renderLayer.bind(this);
|
||||
this.renderLayer();
|
||||
}
|
||||
|
||||
componentWillUnmount() {
|
||||
this.renderLayer(false);
|
||||
this.isMounted = false;
|
||||
if (this.remote && this.remote.parentNode) this.remote.parentNode.removeChild(this.remote);
|
||||
}
|
||||
|
||||
findNode(node) {
|
||||
return typeof node === 'string' ? document.querySelector(node) : node;
|
||||
}
|
||||
|
||||
renderLayer(show = true) {
|
||||
if (!this.isMounted) return;
|
||||
|
||||
// clean up old node if moving bases:
|
||||
if (this.props.into !== this.intoPointer) {
|
||||
this.intoPointer = this.props.into;
|
||||
if (this.into && this.remote) {
|
||||
this.remote = render(html`<${PortalProxy} />`, this.into, this.remote);
|
||||
}
|
||||
this.into = this.findNode(this.props.into);
|
||||
}
|
||||
|
||||
this.remote = render(html`
|
||||
<${PortalProxy} context=${this.context}>
|
||||
${show && this.props.children || null}
|
||||
</${PortalProxy}>
|
||||
`, this.into, this.remote);
|
||||
}
|
||||
|
||||
render() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
// high-order component that renders its first child if it exists.
|
||||
// used as a conditional rendering proxy.
|
||||
class PortalProxy extends Component {
|
||||
getChildContext() {
|
||||
return this.props.context;
|
||||
}
|
||||
render({ children }) {
|
||||
return children || null;
|
||||
}
|
||||
}
|
||||
|
||||
function App(props) {
|
||||
|
||||
return html`
|
||||
<div id="container">
|
||||
<div>
|
||||
<header>
|
||||
<h1>llama.cpp</h1>
|
||||
</header>
|
||||
|
@ -624,11 +839,13 @@
|
|||
`;
|
||||
}
|
||||
|
||||
render(h(App), document.body);
|
||||
render(h(App), document.querySelector('#container'));
|
||||
</script>
|
||||
</head>
|
||||
|
||||
<body>
|
||||
<div id="container"></div>
|
||||
<div id="portal"></div>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
|
|
@ -124,8 +124,9 @@ static void server_log(const char *level, const char *function, int line,
|
|||
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
|
||||
{
|
||||
std::string out = token == -1 ? "" : llama_token_to_str(ctx, token);
|
||||
// if first bit is 1, meaning it's a partial character
|
||||
if (out.size() > 0 && (out[0] & 0x80) == 0x80)
|
||||
// if the size is 1 and first bit is 1, meaning it's a partial character
|
||||
// (size > 1 meaning it's already a known token)
|
||||
if (out.size() == 1 && (out[0] & 0x80) == 0x80)
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << std::hex << (out[0] & 0xff);
|
||||
|
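The old check treated any output whose first byte had the high bit set as a partial character; the new check only does so when the output is a single byte, since longer output is already a complete token string. The same condition in Python, as an illustration (not part of the patch, helper name invented here):

```python
def is_partial_utf8(out: bytes) -> bool:
    # a lone byte with the high bit set is a fragment of a multi-byte character
    return len(out) == 1 and (out[0] & 0x80) == 0x80

assert is_partial_utf8(b"\xe2")                   # first byte of a 3-byte sequence
assert not is_partial_utf8("é".encode("utf-8"))   # 2 bytes: already a complete piece of text
```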
@ -1321,59 +1322,86 @@ int main(int argc, char **argv)
|
|||
|
||||
while (llama.has_next_token) {
|
||||
const completion_token_output token_with_probs = llama.doCompletion();
|
||||
const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_str(llama.ctx, token_with_probs.tok);
|
||||
if (llama.multibyte_pending > 0) {
|
||||
if (token_with_probs.tok == -1 || llama.multibyte_pending > 0) {
|
||||
continue;
|
||||
}
|
||||
const std::string token_text = llama_token_to_str(llama.ctx, token_with_probs.tok);
|
||||
|
||||
size_t pos = std::min(sent_count, llama.generated_text.size());
|
||||
|
||||
const std::string str_test = llama.generated_text.substr(pos);
|
||||
bool is_stop_full = false;
|
||||
size_t stop_pos =
|
||||
llama.findStoppingStrings(str_test, token_text.size(), STOP_FULL);
|
||||
if (stop_pos != std::string::npos) {
|
||||
is_stop_full = true;
|
||||
llama.generated_text.erase(
|
||||
llama.generated_text.begin() + pos + stop_pos,
|
||||
llama.generated_text.end());
|
||||
pos = std::min(sent_count, llama.generated_text.size());
|
||||
} else {
|
||||
is_stop_full = false;
|
||||
stop_pos = llama.findStoppingStrings(str_test, token_text.size(),
|
||||
STOP_PARTIAL);
|
||||
}
|
||||
|
||||
const std::string to_send = llama.generated_text.substr(pos, stop_pos);
|
||||
sent_count += to_send.size();
|
||||
if (
|
||||
stop_pos == std::string::npos ||
|
||||
// Send rest of the text if we are at the end of the generation
|
||||
(!llama.has_next_token && !is_stop_full && stop_pos > 0)
|
||||
) {
|
||||
const std::string to_send = llama.generated_text.substr(pos, std::string::npos);
|
||||
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
sent_count += to_send.size();
|
||||
|
||||
if (llama.params.n_probs > 0) {
|
||||
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
||||
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
||||
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
||||
if (probs_pos < probs_stop_pos) {
|
||||
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
|
||||
std::vector<completion_token_output> probs_output = {};
|
||||
|
||||
if (llama.params.n_probs > 0) {
|
||||
const std::vector<llama_token> to_send_toks = llama_tokenize(llama.ctx, to_send, false);
|
||||
size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size());
|
||||
size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size());
|
||||
if (probs_pos < probs_stop_pos) {
|
||||
probs_output = std::vector<completion_token_output>(llama.generated_token_probs.begin() + probs_pos, llama.generated_token_probs.begin() + probs_stop_pos);
|
||||
}
|
||||
sent_token_probs_index = probs_stop_pos;
|
||||
}
|
||||
|
||||
const json data = format_partial_response(llama, to_send, probs_output);
|
||||
|
||||
const std::string str =
|
||||
"data: " +
|
||||
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
|
||||
LOG_VERBOSE("data stream", {
|
||||
{ "to_send", str }
|
||||
});
|
||||
|
||||
if (!sink.write(str.data(), str.size())) {
|
||||
LOG_VERBOSE("stream closed", {});
|
||||
llama_print_timings(llama.ctx);
|
||||
return false;
|
||||
}
|
||||
sent_token_probs_index = probs_stop_pos;
|
||||
}
|
||||
|
||||
const json data = llama.has_next_token
|
||||
? format_partial_response(llama, to_send, probs_output)
|
||||
// Generation is done, send extra information.
|
||||
: format_final_response(llama, to_send, llama.generated_token_probs);
|
||||
if (!llama.has_next_token) {
|
||||
// Generation is done, send extra information.
|
||||
const json data = format_final_response(llama, "", llama.generated_token_probs);
|
||||
|
||||
const std::string str =
|
||||
"data: " +
|
||||
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
const std::string str =
|
||||
"data: " +
|
||||
data.dump(-1, ' ', false, json::error_handler_t::replace) +
|
||||
"\n\n";
|
||||
|
||||
LOG_VERBOSE("data stream", {
|
||||
{ "to_send", str }
|
||||
});
|
||||
LOG_VERBOSE("data stream", {
|
||||
{ "to_send", str }
|
||||
});
|
||||
|
||||
if (!sink.write(str.data(), str.size())) {
|
||||
LOG_VERBOSE("stream closed", {});
|
||||
llama_print_timings(llama.ctx);
|
||||
return false;
|
||||
if (!sink.write(str.data(), str.size())) {
|
||||
LOG_VERBOSE("stream closed", {});
|
||||
llama_print_timings(llama.ctx);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@
|
|||
|
||||
#define UNUSED(x) (void)(x)
|
||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||
#define GGML_MAX_CONCUR (2*GGML_MAX_NODES)
|
||||
|
||||
//#define GGML_ALLOCATOR_DEBUG
|
||||
|
||||
|
@ -67,7 +68,7 @@ struct ggml_allocr {
|
|||
struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
|
||||
size_t max_size;
|
||||
bool measure;
|
||||
int parse_seq[GGML_MAX_NODES];
|
||||
int parse_seq[GGML_MAX_CONCUR];
|
||||
int parse_seq_len;
|
||||
|
||||
#ifdef GGML_ALLOCATOR_DEBUG
|
||||
|
|
231  ggml-cuda.cu
@@ -6,15 +6,116 @@
#include <atomic>
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// for rocblas_initialize()
|
||||
#include "rocblas/rocblas.h"
|
||||
#endif
|
||||
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
|
||||
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
|
||||
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
|
||||
#define CUBLAS_OP_N HIPBLAS_OP_N
|
||||
#define CUBLAS_OP_T HIPBLAS_OP_T
|
||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||
#define CUDA_R_16F HIPBLAS_R_16F
|
||||
#define CUDA_R_32F HIPBLAS_R_32F
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasGemmEx hipblasGemmEx
|
||||
#define cublasHandle_t hipblasHandle_t
|
||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetStream hipblasSetStream
|
||||
#define cublasSgemm hipblasSgemm
|
||||
#define cublasStatus_t hipblasStatus_t
|
||||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
#define cudaEventDisableTiming hipEventDisableTiming
|
||||
#define cudaEventRecord hipEventRecord
|
||||
#define cudaEvent_t hipEvent_t
|
||||
#define cudaEventDestroy hipEventDestroy
|
||||
#define cudaFree hipFree
|
||||
#define cudaFreeHost hipHostFree
|
||||
#define cudaGetDevice hipGetDevice
|
||||
#define cudaGetDeviceCount hipGetDeviceCount
|
||||
#define cudaGetDeviceProperties hipGetDeviceProperties
|
||||
#define cudaGetErrorString hipGetErrorString
|
||||
#define cudaGetLastError hipGetLastError
|
||||
#define cudaMalloc hipMalloc
|
||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
||||
#define cudaMemcpyAsync hipMemcpyAsync
|
||||
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||
#define cudaMemcpyKind hipMemcpyKind
|
||||
#define cudaMemset hipMemset
|
||||
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||
#define cudaSetDevice hipSetDevice
|
||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||
#define cudaStreamNonBlocking hipStreamNonBlocking
|
||||
#define cudaStreamSynchronize hipStreamSynchronize
|
||||
#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaSuccess hipSuccess
|
||||
#else
|
||||
#include <cuda_runtime.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
|
||||
#ifndef CC_TURING
|
||||
#define CC_TURING 700
|
||||
#endif
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#define __CUDA_ARCH__ 1300
|
||||
|
||||
typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
|
||||
static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
|
||||
return reinterpret_cast<const int&>(c);
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(__gfx1100__)
|
||||
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
||||
#elif defined(__gfx1010__) || defined(__gfx900__)
|
||||
int tmp1;
|
||||
int tmp2;
|
||||
asm("\n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
|
||||
v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
|
||||
v_add3_u32 %0, %1, %2, %0 \n \
|
||||
"
|
||||
: "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
|
||||
: "v"(a), "v"(b)
|
||||
);
|
||||
#else
|
||||
const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
|
||||
const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
|
||||
c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
|
||||
#endif
|
||||
return c;
|
||||
}
|
||||
#endif
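For readers unfamiliar with `__dp4a`: it interprets each 32-bit integer as four signed bytes and accumulates their pairwise products into `c`, which is exactly what the generic `int8x4_t` fallback above computes. A Python illustration (not part of the patch, function name borrowed from the intrinsic):

```python
import struct

def dp4a(a: int, b: int, c: int) -> int:
    # unpack each 32-bit word into four signed bytes (little-endian) and
    # accumulate the byte-wise dot product, like the int8x4_t fallback above
    va = struct.unpack("<4b", struct.pack("<i", a))
    vb = struct.unpack("<4b", struct.pack("<i", b))
    return c + sum(x * y for x, y in zip(va, vb))

assert dp4a(0x01010101, 0x02020202, 0) == 8   # four lanes of 1 * 2
```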
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
|
@ -424,8 +525,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
|
|||
static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||
const block_q4_1 * x = (const block_q4_1 *) vx;
|
||||
|
||||
const dfloat d = x[ib].dm.x;
|
||||
const dfloat m = x[ib].dm.y;
|
||||
const dfloat d = __low2half(x[ib].dm);
|
||||
const dfloat m = __high2half(x[ib].dm);
|
||||
|
||||
const int vui = x[ib].qs[iqs];
|
||||
|
||||
|
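The `dm.x`/`dm.y` member accesses are replaced with `__low2half`/`__high2half` throughout this file because `dm` is a packed `half2` (two fp16 values in one 32-bit word) and the intrinsics work on both CUDA and HIP. A numpy illustration of the same unpacking (not part of the patch, helper name invented here, assumes a little-endian layout):

```python
import numpy as np

def split_half2(dm_bits: int):
    # reinterpret one 32-bit word as two packed fp16 values: (low, high) ~ (d, m)
    raw = np.array([dm_bits], dtype="<u4").tobytes()
    lo, hi = np.frombuffer(raw, dtype="<f2")
    return float(lo), float(hi)

assert split_half2(0x3C004000) == (2.0, 1.0)   # low half 0x4000 = 2.0, high half 0x3C00 = 1.0
```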
@ -467,8 +568,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
|
|||
static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
|
||||
const block_q5_1 * x = (const block_q5_1 *) vx;
|
||||
|
||||
const dfloat d = x[ib].dm.x;
|
||||
const dfloat m = x[ib].dm.y;
|
||||
const dfloat d = __low2half(x[ib].dm);
|
||||
const dfloat m = __high2half(x[ib].dm);
|
||||
|
||||
uint32_t qh;
|
||||
memcpy(&qh, x[ib].qh, sizeof(qh));
|
||||
|
@ -520,8 +621,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
|
|||
const uint8_t q = x[i].qs[32*n + l];
|
||||
float * y = yy + i*QK_K + 128*n;
|
||||
|
||||
float dall = x[i].dm.x;
|
||||
float dmin = x[i].dm.y;
|
||||
float dall = __low2half(x[i].dm);
|
||||
float dmin = __high2half(x[i].dm);
|
||||
y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||
y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
|
||||
|
@ -531,8 +632,8 @@ static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float
|
|||
const int il = tid%16; // 0...15
|
||||
const uint8_t q = x[i].qs[il] >> (2*is);
|
||||
float * y = yy + i*QK_K + 16*is + il;
|
||||
float dall = x[i].dm.x;
|
||||
float dmin = x[i].dm.y;
|
||||
float dall = __low2half(x[i].dm);
|
||||
float dmin = __high2half(x[i].dm);
|
||||
y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
|
||||
y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
|
||||
#endif
|
||||
|
@ -618,8 +719,8 @@ static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float
|
|||
|
||||
float * y = yy + i*QK_K + 64*il + n*ir;
|
||||
|
||||
const float dall = x[i].dm.x;
|
||||
const float dmin = x[i].dm.y;
|
||||
const float dall = __low2half(x[i].dm);
|
||||
const float dmin = __high2half(x[i].dm);
|
||||
|
||||
const uint8_t * q = x[i].qs + 32*il + n*ir;
|
||||
|
||||
|
@ -657,8 +758,8 @@ static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float
|
|||
|
||||
float * y = yy + i*QK_K + 64*il + 2*ir;
|
||||
|
||||
const float dall = x[i].dm.x;
|
||||
const float dmin = x[i].dm.y;
|
||||
const float dall = __low2half(x[i].dm);
|
||||
const float dmin = __high2half(x[i].dm);
|
||||
|
||||
const uint8_t * ql = x[i].qs + 32*il + 2*ir;
|
||||
const uint8_t * qh = x[i].qh + 2*ir;
|
||||
|
@ -770,8 +871,8 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
|||
const float * y = yy + i * QK_K + y_offset;
|
||||
const uint8_t * q = x[i].qs + q_offset;
|
||||
|
||||
const float dall = x[i].dm.x;
|
||||
const float dmin = x[i].dm.y;
|
||||
const float dall = __low2half(x[i].dm);
|
||||
const float dmin = __high2half(x[i].dm);
|
||||
|
||||
const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
|
||||
aux[0] = a[0] & 0x0f0f0f0f;
|
||||
|
@ -991,8 +1092,8 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
|
|||
const float * y1 = yy + i*QK_K + y_offset;
|
||||
const float * y2 = y1 + 128;
|
||||
|
||||
const float dall = x[i].dm.x;
|
||||
const float dmin = x[i].dm.y;
|
||||
const float dall = __low2half(x[i].dm);
|
||||
const float dmin = __high2half(x[i].dm);
|
||||
|
||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||
aux[0] = a[im+0] & kmask1;
|
||||
|
@ -1124,8 +1225,8 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
|
|||
const float * y1 = yy + i*QK_K + y_offset;
|
||||
const float * y2 = y1 + 128;
|
||||
|
||||
const float dall = x[i].dm.x;
|
||||
const float dmin = x[i].dm.y;
|
||||
const float dall = __low2half(x[i].dm);
|
||||
const float dmin = __high2half(x[i].dm);
|
||||
|
||||
const uint16_t * a = (const uint16_t *)x[i].scales;
|
||||
aux[0] = a[im+0] & kmask1;
|
||||
|
@ -1348,8 +1449,8 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest
|
|||
return;
|
||||
}
|
||||
|
||||
y[ib].ds.x = d;
|
||||
y[ib].ds.y = sum;
|
||||
reinterpret_cast<half&>(y[ib].ds.x) = d;
|
||||
reinterpret_cast<half&>(y[ib].ds.y) = sum;
|
||||
}
|
||||
|
||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||
|
@ -2346,7 +2447,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
|
|||
u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
|
||||
}
|
||||
|
||||
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, bq8_1->ds.x);
|
||||
return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d, __low2half(bq8_1->ds));
|
||||
}
|
||||
|
||||
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
|
||||
|
@ -2432,7 +2533,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
|||
#pragma unroll
|
||||
for (int i = 0; i < QR2_K; ++ i) {
|
||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
||||
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
||||
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
|
||||
}
|
||||
|
||||
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
||||
|
@ -2551,7 +2652,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
|||
#pragma unroll
|
||||
for (int i = 0; i < QR3_K; ++i) {
|
||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
||||
d8[i] = bq8_1[bq8_offset + i].ds.x;
|
||||
d8[i] = __low2half(bq8_1[bq8_offset + i].ds);
|
||||
}
|
||||
|
||||
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
||||
|
@ -2720,7 +2821,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|||
|
||||
for (int i = 0; i < QR4_K; ++i) {
|
||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||
d8[i] = bq8i->ds.x;
|
||||
d8[i] = __low2half(bq8i->ds);
|
||||
|
||||
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
||||
u[2*i+0] = q8[0];
|
||||
|
@ -2747,8 +2848,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|||
const float dall = bq4_K->d[0];
|
||||
const float dmin = bq4_K->d[1];
|
||||
|
||||
const float d8_1 = bq8_1[0].ds.x;
|
||||
const float d8_2 = bq8_1[1].ds.x;
|
||||
const float d8_1 = __low2float(bq8_1[0].ds);
|
||||
const float d8_2 = __low2float(bq8_1[1].ds);
|
||||
|
||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||
|
@ -2901,7 +3002,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|||
#pragma unroll
|
||||
for (int i = 0; i < QR5_K; ++i) {
|
||||
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
||||
d8[i] = bq8i->ds.x;
|
||||
d8[i] = __low2float(bq8i->ds);
|
||||
|
||||
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
||||
u[2*i+0] = q8[0];
|
||||
|
@ -2919,8 +3020,8 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|||
|
||||
const float d = bq5_K->d;
|
||||
|
||||
const float d8_1 = bq8_1[0].ds.x;
|
||||
const float d8_2 = bq8_1[1].ds.x;
|
||||
const float d8_1 = __low2half(bq8_1[0].ds);
|
||||
const float d8_2 = __low2half(bq8_1[1].ds);
|
||||
|
||||
const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
|
||||
const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
|
||||
|
@ -3075,7 +3176,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|||
#pragma unroll
|
||||
for (int i = 0; i < QR6_K; ++i) {
|
||||
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
||||
d8[i] = bq8_1[bq8_offset + 2*i].ds.x;
|
||||
d8[i] = __low2half(bq8_1[bq8_offset + 2*i].ds);
|
||||
}
|
||||
|
||||
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
||||
|
@ -3243,7 +3344,7 @@ static __device__ __forceinline__ void mul_mat_q(
|
|||
*dsi_dst = *dsi_src;
|
||||
} else {
|
||||
float * dfi_dst = (float *) dsi_dst;
|
||||
*dfi_dst = (*dsi_src).x;
|
||||
*dfi_dst = __low2half(*dsi_src);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -3907,28 +4008,27 @@ static __global__ void rope_f32(const float * x, float * dst, const int ncols, c
|
|||
dst[i + 1] = x0*sin_theta + x1*cos_theta;
|
||||
}
|
||||
|
||||
// TODO: this implementation is wrong!
|
||||
//static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
||||
// const float p_delta, const int p_delta_rows, const float theta_scale) {
|
||||
// const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
//
|
||||
// if (col >= ncols) {
|
||||
// return;
|
||||
// }
|
||||
//
|
||||
// const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
// const int i = row*ncols + col/2;
|
||||
//
|
||||
// const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
||||
// const float sin_theta = sinf(theta);
|
||||
// const float cos_theta = cosf(theta);
|
||||
//
|
||||
// const float x0 = x[i + 0];
|
||||
// const float x1 = x[i + ncols/2];
|
||||
//
|
||||
// dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
// dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
||||
//}
|
||||
static __global__ void rope_neox_f32(const float * x, float * dst, const int ncols, const float p0,
|
||||
const float p_delta, const int p_delta_rows, const float theta_scale) {
|
||||
const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);
|
||||
|
||||
if (col >= ncols) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int row = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
const int i = row*ncols + col/2;
|
||||
|
||||
const float theta = (p0 + p_delta * (row/p_delta_rows))*powf(theta_scale, col/2);
|
||||
const float sin_theta = sinf(theta);
|
||||
const float cos_theta = cosf(theta);
|
||||
|
||||
const float x0 = x[i + 0];
|
||||
const float x1 = x[i + ncols/2];
|
||||
|
||||
dst[i + 0] = x0*cos_theta - x1*sin_theta;
|
||||
dst[i + ncols/2] = x0*sin_theta + x1*cos_theta;
|
||||
}
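The previously commented-out NeoX kernel is now enabled. As a reference for what it computes, here is a per-row Python restatement (illustration only, not part of the patch, helper name invented here). Unlike the interleaved `rope_f32` above, element `i` is rotated against element `i + ncols/2`.

```python
import math

def rope_neox_row(x, row, p0, p_delta, p_delta_rows, theta_scale):
    # mirrors rope_neox_f32 above for a single row of length ncols
    ncols = len(x)
    out = list(x)
    for col in range(0, ncols, 2):
        theta = (p0 + p_delta * (row // p_delta_rows)) * theta_scale ** (col // 2)
        i = col // 2
        x0, x1 = x[i], x[i + ncols // 2]
        out[i]              = x0 * math.cos(theta) - x1 * math.sin(theta)
        out[i + ncols // 2] = x0 * math.sin(theta) + x1 * math.cos(theta)
    return out
```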
|
||||
|
||||
static __global__ void rope_glm_f32(const float * x, float * dst, const int ncols, const float p, const float block_p, const float theta_scale) {
|
||||
const int col = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
@ -4799,13 +4899,21 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons
|
|||
|
||||
static void rope_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
||||
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
||||
GGML_ASSERT(nrows % 2 == 0);
|
||||
GGML_ASSERT(nrows % 2 == 0); // GG: is this assert really needed? I don't see why
|
||||
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nrows, num_blocks_x, 1);
|
||||
rope_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
||||
}
|
||||
|
||||
static void rope_neox_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p0,
|
||||
const float p_delta, const int p_delta_rows, const float theta_scale, cudaStream_t stream) {
|
||||
const dim3 block_dims(1, 2*CUDA_ROPE_BLOCK_SIZE, 1);
|
||||
const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
|
||||
const dim3 block_nums(nrows, num_blocks_x, 1);
|
||||
rope_neox_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, p0, p_delta, p_delta_rows, theta_scale);
|
||||
}
|
||||
|
||||
static void rope_glm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float p, const float block_p, const float theta_scale, cudaStream_t stream) {
|
||||
GGML_ASSERT(nrows % 4 == 0);
|
||||
const dim3 block_dims(4*CUDA_ROPE_BLOCK_SIZE, 1, 1);
|
||||
|
@ -4937,10 +5045,18 @@ void ggml_init_cublas() {
|
|||
static bool initialized = false;
|
||||
|
||||
if (!initialized) {
|
||||
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
// https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
|
||||
rocblas_initialize();
|
||||
CUDA_CHECK(cudaDeviceSynchronize());
|
||||
#endif
|
||||
|
||||
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
|
||||
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
||||
int64_t total_vram = 0;
|
||||
fprintf(stderr, "%s: found %d CUDA devices:\n", __func__, g_device_count);
|
||||
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||
|
@ -5548,8 +5664,9 @@ inline void ggml_cuda_op_rope(
|
|||
const float block_p = max(p - (n_ctx - 2.f), 0.f);
|
||||
rope_glm_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, id_p, block_p, theta_scale, cudaStream_main);
|
||||
} else if (is_neox) {
|
||||
GGML_ASSERT(false && "RoPE NeoX not implemented yet");
|
||||
#pragma message("TODO: implement RoPE NeoX for CUDA")
|
||||
GGML_ASSERT(ne00 == n_dims && "ne00 != n_dims is not implemented for CUDA yet");
|
||||
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
||||
rope_neox_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
||||
} else {
|
||||
const float p0 = (((mode & 1) == 0 ? n_past : 0)) * freq_scale;
|
||||
rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p0, freq_scale, ne01, theta_scale, cudaStream_main);
|
||||
|
|
|
@ -2,6 +2,14 @@
|
|||
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef GGML_USE_HIPBLAS
|
||||
#define GGML_CUDA_NAME "ROCm"
|
||||
#define GGML_CUBLAS_NAME "hipBLAS"
|
||||
#else
|
||||
#define GGML_CUDA_NAME "CUDA"
|
||||
#define GGML_CUBLAS_NAME "cuBLAS"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
21  gguf-py/LICENSE  Normal file
@@ -0,0 +1,21 @@
MIT License
|
||||
|
||||
Copyright (c) 2023 Georgi Gerganov
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
55  gguf-py/README.md  Normal file
@@ -0,0 +1,55 @@
## gguf
|
||||
|
||||
This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
|
||||
(GGML Universal File) format.
|
||||
|
||||
See [convert-llama-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert-llama-hf-to-gguf.py)
|
||||
as an example for its usage.
|
||||
|
||||
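For orientation, a minimal usage sketch follows; it assumes the `GGUFWriter` API as exercised by the conversion scripts (`add_name`, `add_tensor`, and the `write_*_to_file` calls), so treat the exact names and signatures as assumptions rather than a reference:

```python
# Minimal sketch; assumes the GGUFWriter API used by the conversion scripts.
import numpy as np
import gguf

# Create a writer for a new file, tagged with an architecture name.
writer = gguf.GGUFWriter("example.gguf", "llama")

writer.add_name("example-model")  # one key/value metadata entry
writer.add_tensor("token_embd.weight", np.zeros((16, 8), dtype=np.float32))

# Header and metadata are written first, then the tensor data.
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```
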
## Installation
```sh
pip install gguf
```

## Development
Maintainers who participate in development of this package are advised to install it in editable mode:

```sh
cd /path/to/llama.cpp/gguf-py

pip install --editable .
```

**Note**: This may require upgrading your Pip installation, signalled by a message saying that editable installation currently requires `setup.py`.
In that case, upgrade Pip to the latest version:

```sh
pip install --upgrade pip
```

## Publishing
To publish the package, you need to have `twine` and `build` installed:

```sh
pip install build twine
```

Then, follow these steps to release a new version:

1. Update the version in `pyproject.toml`.
2. Build the package:

```sh
python -m build
```

3. Upload the generated distribution archives:

```sh
python -m twine upload dist/*
```

## TODO
- [ ] Add tests
- [ ] Include conversion scripts as command line entry points in this package.
- [ ] Add CI workflow for releasing the package.

1
gguf-py/gguf/__init__.py
Normal file

@@ -0,0 +1 @@
from .gguf import *

0
gguf.py → gguf-py/gguf/gguf.py
Executable file → Normal file
28
gguf-py/pyproject.toml
Normal file

@@ -0,0 +1,28 @@
[tool.poetry]
name = "gguf"
version = "0.2.1"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
    {include = "gguf"},
]
readme = "README.md"
homepage = "https://ggml.ai"
repository = "https://github.com/ggerganov/llama.cpp"
keywords = ["ggml", "gguf", "llama.cpp"]
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

[tool.poetry.dependencies]
python = ">=3.8"
numpy = ">=1.17"

[tool.poetry.dev-dependencies]
pytest = "^5.2"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

7
gguf-py/tests/test_gguf.py
Normal file

@@ -0,0 +1,7 @@
import gguf

# TODO: add tests


def test_write_gguf():
    pass

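A plausible first test, assuming the module-level `GGUF_MAGIC` constant defined in `gguf.py` is re-exported by the package, would pin down the file magic:

```python
# Hypothetical starter test; assumes gguf.py's GGUF_MAGIC constant is exported.
import struct

import gguf


def test_gguf_magic_constant():
    # A GGUF file begins with the ASCII bytes "GGUF", read as a
    # little-endian uint32 (0x46554747).
    assert struct.pack("<I", gguf.GGUF_MAGIC) == b"GGUF"
```
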
42
llama.cpp

@@ -1836,7 +1836,7 @@ static void llm_load_tensors(
     (void) main_gpu;
     (void) mul_mat_q;
 #if defined(GGML_USE_CUBLAS)
-    LLAMA_LOG_INFO("%s: using CUDA for GPU acceleration\n", __func__);
+    LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
     ggml_cuda_set_main_device(main_gpu);
     ggml_cuda_set_mul_mat_q(mul_mat_q);
 #define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU

@@ -1958,6 +1958,14 @@ static void llm_load_tensors(
             model.output_norm   = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
             model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"),   {n_embd},          backend_norm);
             model.output        = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+            if (backend_norm == GGML_BACKEND_GPU) {
+                vram_weights += ggml_nbytes(model.output_norm);
+                vram_weights += ggml_nbytes(model.output_norm_b);
+            }
+            if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                vram_weights += ggml_nbytes(model.output);
+            }
         }

         const uint32_t n_ff = hparams.n_ff;

@@ -1967,7 +1975,7 @@ static void llm_load_tensors(
         model.layers.resize(n_layer);

         for (uint32_t i = 0; i < n_layer; ++i) {
-            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
+            const ggml_backend backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
             const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

             auto & layer = model.layers[i];

@@ -1978,6 +1986,11 @@ static void llm_load_tensors(
             if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
                 layer.attn_norm_2   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
                 layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i),   {n_embd}, backend);
+
+                if (backend == GGML_BACKEND_GPU) {
+                    vram_weights += ggml_nbytes(layer.attn_norm_2);
+                    vram_weights += ggml_nbytes(layer.attn_norm_2_b);
+                }
             }

             layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);

@@ -1985,6 +1998,13 @@ static void llm_load_tensors(

             layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
             layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd,   n_ff}, backend_split);
+
+            if (backend == GGML_BACKEND_GPU) {
+                vram_weights +=
+                    ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
+                    ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.wo)          +
+                    ggml_nbytes(layer.w2)        + ggml_nbytes(layer.w3);
+            }
         }
     } break;
 default:

@@ -5277,13 +5297,29 @@ int llama_model_n_embd(const struct llama_model * model) {
     return model->hparams.n_embd;
 }

-int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size) {
+int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
     return snprintf(buf, buf_size, "%s %s %s",
             model->name.c_str(),
             llama_model_type_name(model->type),
             llama_model_ftype_name(model->ftype).c_str());
 }

+uint64_t llama_model_size(const struct llama_model * model) {
+    uint64_t size = 0;
+    for (const auto & it : model->tensors_by_name) {
+        size += ggml_nbytes(it.second);
+    }
+    return size;
+}
+
+uint64_t llama_model_n_params(const struct llama_model * model) {
+    uint64_t nparams = 0;
+    for (const auto & it : model->tensors_by_name) {
+        nparams += ggml_nelements(it.second);
+    }
+    return nparams;
+}
+
 int llama_model_quantize(
         const char * fname_inp,
         const char * fname_out,

8
llama.h

@@ -254,7 +254,11 @@ extern "C" {
     LLAMA_API int llama_model_n_embd (const struct llama_model * model);

     // Get a string describing the model type
-    LLAMA_API int llama_model_type(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    // Returns the total size of all the tensors in the model in bytes
+    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
+    // Returns the total number of parameters in the model
+    LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

     // Returns 0 on success
     LLAMA_API int llama_model_quantize(

@@ -348,7 +352,7 @@ extern "C" {

     LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token);

-    LLAMA_API llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);
+    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token);

     // Special tokens
     LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx);  // beginning-of-sentence

@@ -1,2 +1,3 @@
 numpy==1.24
 sentencepiece==0.1.98
+gguf>=0.1.0