Merge 'origin/master' into hipblas
This commit is contained in:
commit
80e4e548bf
42 changed files with 4133 additions and 3539 deletions
33
.devops/full-cuda.Dockerfile
Normal file
33
.devops/full-cuda.Dockerfile
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
|
||||||
|
# This needs to generally match the container host's environment.
|
||||||
|
ARG CUDA_VERSION=11.7.1
|
||||||
|
|
||||||
|
# Target the CUDA build image
|
||||||
|
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||||
|
|
||||||
|
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||||
|
|
||||||
|
# Unless otherwise specified, we make a fat build.
|
||||||
|
ARG CUDA_DOCKER_ARCH=all
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y build-essential python3 python3-pip
|
||||||
|
|
||||||
|
COPY requirements.txt requirements.txt
|
||||||
|
|
||||||
|
RUN pip install --upgrade pip setuptools wheel \
|
||||||
|
&& pip install -r requirements.txt
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Set nvcc architecture
|
||||||
|
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||||
|
# Enable cuBLAS
|
||||||
|
ENV LLAMA_CUBLAS=1
|
||||||
|
|
||||||
|
RUN make
|
||||||
|
|
||||||
|
ENTRYPOINT ["/app/.devops/tools.sh"]
|
32
.devops/main-cuda.Dockerfile
Normal file
32
.devops/main-cuda.Dockerfile
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
ARG UBUNTU_VERSION=22.04
|
||||||
|
# This needs to generally match the container host's environment.
|
||||||
|
ARG CUDA_VERSION=11.7.1
|
||||||
|
# Target the CUDA build image
|
||||||
|
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||||
|
# Target the CUDA runtime image
|
||||||
|
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||||
|
|
||||||
|
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||||
|
|
||||||
|
# Unless otherwise specified, we make a fat build.
|
||||||
|
ARG CUDA_DOCKER_ARCH=all
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y build-essential
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Set nvcc architecture
|
||||||
|
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||||
|
# Enable cuBLAS
|
||||||
|
ENV LLAMA_CUBLAS=1
|
||||||
|
|
||||||
|
RUN make
|
||||||
|
|
||||||
|
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||||
|
|
||||||
|
COPY --from=build /app/main /main
|
||||||
|
|
||||||
|
ENTRYPOINT [ "/main" ]
|
17
.github/workflows/build.yml
vendored
17
.github/workflows/build.yml
vendored
|
@ -16,7 +16,10 @@ on:
|
||||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
||||||
|
|
||||||
env:
|
env:
|
||||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||||
|
GGML_NLOOP: 3
|
||||||
|
GGML_NITER: 1
|
||||||
|
GGML_N_THREADS: 1
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
ubuntu-focal-make:
|
ubuntu-focal-make:
|
||||||
|
@ -64,7 +67,7 @@ jobs:
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest --verbose
|
ctest --verbose --timeout 900
|
||||||
|
|
||||||
ubuntu-latest-cmake-sanitizer:
|
ubuntu-latest-cmake-sanitizer:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
@ -99,7 +102,7 @@ jobs:
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest --verbose
|
ctest --verbose --timeout 900
|
||||||
|
|
||||||
macOS-latest-make:
|
macOS-latest-make:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
@ -137,19 +140,21 @@ jobs:
|
||||||
- name: Build
|
- name: Build
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
|
sysctl -a
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_AVX2=OFF ..
|
cmake -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF ..
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
id: cmake_test
|
id: cmake_test
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest --verbose
|
ctest --verbose --timeout 900
|
||||||
|
|
||||||
windows-latest-cmake:
|
windows-latest-cmake:
|
||||||
runs-on: windows-latest
|
runs-on: windows-latest
|
||||||
|
|
||||||
env:
|
env:
|
||||||
OPENBLAS_VERSION: 0.3.23
|
OPENBLAS_VERSION: 0.3.23
|
||||||
OPENCL_VERSION: 2023.04.17
|
OPENCL_VERSION: 2023.04.17
|
||||||
|
@ -248,7 +253,7 @@ jobs:
|
||||||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
|
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
|
||||||
run: |
|
run: |
|
||||||
cd build
|
cd build
|
||||||
ctest -C Release --verbose
|
ctest -C Release --verbose --timeout 900
|
||||||
|
|
||||||
- name: Get commit hash
|
- name: Get commit hash
|
||||||
id: commit
|
id: commit
|
||||||
|
|
|
@ -68,8 +68,9 @@ option(LLAMA_ACCELERATE "llama: enable Accelerate framework
|
||||||
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
option(LLAMA_BLAS "llama: use BLAS" OFF)
|
||||||
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
|
||||||
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
|
||||||
|
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||||
set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
|
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||||
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
|
option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
|
||||||
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
|
||||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||||
|
@ -217,6 +218,9 @@ if (LLAMA_BLAS)
|
||||||
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
|
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
|
||||||
add_compile_options(${BLAS_LINKER_FLAGS})
|
add_compile_options(${BLAS_LINKER_FLAGS})
|
||||||
add_compile_definitions(GGML_USE_OPENBLAS)
|
add_compile_definitions(GGML_USE_OPENBLAS)
|
||||||
|
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
|
||||||
|
add_compile_definitions(GGML_BLAS_USE_MKL)
|
||||||
|
endif()
|
||||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||||
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
|
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
|
||||||
|
|
||||||
|
@ -247,8 +251,14 @@ if (LLAMA_CUBLAS)
|
||||||
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
|
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
|
||||||
|
|
||||||
add_compile_definitions(GGML_USE_CUBLAS)
|
add_compile_definitions(GGML_USE_CUBLAS)
|
||||||
|
if (LLAMA_CUDA_FORCE_DMMV)
|
||||||
|
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
|
||||||
|
endif()
|
||||||
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
|
||||||
add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
|
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
|
||||||
|
if (DEFINED LLAMA_CUDA_DMMV_Y)
|
||||||
|
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
|
||||||
|
endif()
|
||||||
if (LLAMA_CUDA_DMMV_F16)
|
if (LLAMA_CUDA_DMMV_F16)
|
||||||
add_compile_definitions(GGML_CUDA_DMMV_F16)
|
add_compile_definitions(GGML_CUDA_DMMV_F16)
|
||||||
endif()
|
endif()
|
||||||
|
@ -264,7 +274,7 @@ if (LLAMA_CUBLAS)
|
||||||
if (LLAMA_CUDA_DMMV_F16)
|
if (LLAMA_CUDA_DMMV_F16)
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
|
set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
|
||||||
else()
|
else()
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
|
set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||||
|
|
21
Makefile
21
Makefile
|
@ -163,17 +163,27 @@ ifdef LLAMA_CUBLAS
|
||||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
|
||||||
OBJS += ggml-cuda.o
|
OBJS += ggml-cuda.o
|
||||||
NVCC = nvcc
|
NVCC = nvcc
|
||||||
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
|
NVCCFLAGS = --forward-unknown-to-host-compiler
|
||||||
|
ifdef CUDA_DOCKER_ARCH
|
||||||
|
NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH)
|
||||||
|
else
|
||||||
|
NVCCFLAGS += -arch=native
|
||||||
|
endif # CUDA_DOCKER_ARCH
|
||||||
|
ifdef LLAMA_CUDA_FORCE_DMMV
|
||||||
|
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
|
||||||
|
endif # LLAMA_CUDA_FORCE_DMMV
|
||||||
ifdef LLAMA_CUDA_DMMV_X
|
ifdef LLAMA_CUDA_DMMV_X
|
||||||
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
|
||||||
else
|
else
|
||||||
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
|
NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
|
||||||
endif # LLAMA_CUDA_DMMV_X
|
endif # LLAMA_CUDA_DMMV_X
|
||||||
ifdef LLAMA_CUDA_DMMV_Y
|
ifdef LLAMA_CUDA_MMV_Y
|
||||||
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
|
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
|
||||||
|
else ifdef LLAMA_CUDA_DMMV_Y
|
||||||
|
NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
|
||||||
else
|
else
|
||||||
NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
|
NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
|
||||||
endif # LLAMA_CUDA_DMMV_Y
|
endif # LLAMA_CUDA_MMV_Y
|
||||||
ifdef LLAMA_CUDA_DMMV_F16
|
ifdef LLAMA_CUDA_DMMV_F16
|
||||||
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
|
NVCCFLAGS += -DGGML_CUDA_DMMV_F16
|
||||||
endif # LLAMA_CUDA_DMMV_F16
|
endif # LLAMA_CUDA_DMMV_F16
|
||||||
|
@ -182,6 +192,7 @@ ifdef LLAMA_CUDA_KQUANTS_ITER
|
||||||
else
|
else
|
||||||
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||||
endif # LLAMA_CUBLAS
|
endif # LLAMA_CUBLAS
|
||||||
|
|
47
README.md
47
README.md
|
@ -11,6 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
||||||
|
|
||||||
**Hot topics:**
|
**Hot topics:**
|
||||||
|
|
||||||
|
- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
|
||||||
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
|
- k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
|
||||||
- New roadmap: https://github.com/users/ggerganov/projects/7
|
- New roadmap: https://github.com/users/ggerganov/projects/7
|
||||||
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
|
- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
|
||||||
|
@ -85,7 +86,7 @@ as the main playground for developing new features for the [ggml](https://github
|
||||||
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
|
- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
|
||||||
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
|
- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)
|
||||||
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
|
- [X] [WizardLM](https://github.com/nlpxucan/WizardLM)
|
||||||
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B)
|
- [X] [Baichuan-7B](https://huggingface.co/baichuan-inc/baichuan-7B) and its derivatives (such as [baichuan-7b-sft](https://huggingface.co/hiyouga/baichuan-7b-sft))
|
||||||
|
|
||||||
**Bindings:**
|
**Bindings:**
|
||||||
|
|
||||||
|
@ -344,8 +345,9 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
|
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|-------------------------|------------------------|---------|-------------|
|
|-------------------------|------------------------|---------|-------------|
|
||||||
|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 7.0/Turing/RTX 2000 or higher). Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_DMMV_Y | Positive integer | 1 | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
|
| LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
|
||||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
|
|
||||||
|
@ -693,7 +695,7 @@ export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
|
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
|
||||||
|
|
||||||
Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.
|
Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
|
||||||
|
|
||||||
### Docker
|
### Docker
|
||||||
|
|
||||||
|
@ -729,6 +731,38 @@ or with a light image:
|
||||||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Docker With CUDA
|
||||||
|
|
||||||
|
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU-enabled cloud, `cuBLAS` should be accessible inside the container.
|
||||||
|
|
||||||
|
#### Building Locally
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||||
|
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
|
||||||
|
```
|
||||||
|
|
||||||
|
You may want to override some of the build arguments (`ARG`s, passed with `--build-arg`), depending on the CUDA environment supported by your container host, as well as the GPU architecture.
|
||||||
|
|
||||||
|
The defaults are:
|
||||||
|
|
||||||
|
- `CUDA_VERSION` set to `11.7.1`
|
||||||
|
- `CUDA_DOCKER_ARCH` set to `all`
|
||||||
|
|
||||||
|
The resulting images are essentially the same as the non-CUDA images:
|
||||||
|
|
||||||
|
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml format and quantize them to 4 bits.
|
||||||
|
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
|
||||||
|
|
||||||
|
#### Usage
|
||||||
|
|
||||||
|
After building locally, usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||||
|
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||||
|
```
|
||||||
|
|
||||||
### Contributing
|
### Contributing
|
||||||
|
|
||||||
- Contributors can open PRs
|
- Contributors can open PRs
|
||||||
|
@ -749,5 +783,10 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /mode
|
||||||
|
|
||||||
### Docs
|
### Docs
|
||||||
|
|
||||||
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
- [main](./examples/main/README.md)
|
||||||
|
- [server](./examples/server/README.md)
|
||||||
|
- [embd-input](./examples/embd-input/README.md)
|
||||||
|
- [jeopardy](./examples/jeopardy/README.md)
|
||||||
|
- [BLIS](./docs/BLIS.md)
|
||||||
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
|
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
|
||||||
|
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
|
||||||
|
|
|
@ -154,9 +154,15 @@ class Params:
|
||||||
# try transformer naming first
|
# try transformer naming first
|
||||||
if "model.layers.0.self_attn.q_proj.weight" in model:
|
if "model.layers.0.self_attn.q_proj.weight" in model:
|
||||||
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.q_proj.weight" not in model)
|
||||||
|
elif "model.layers.0.self_attn.W_pack.weight" in model: # next: try baichuan naming
|
||||||
|
n_layer=next(i for i in itertools.count() if f"model.layers.{i}.self_attn.W_pack.weight" not in model)
|
||||||
else:
|
else:
|
||||||
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
n_layer=next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model)
|
||||||
|
|
||||||
|
if n_layer < 1:
|
||||||
|
raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n"
|
||||||
|
"Suggestion: provide 'config.json' of the model in the same directory containing model files.")
|
||||||
|
|
||||||
n_head=n_embd // 128 # guessed
|
n_head=n_embd // 128 # guessed
|
||||||
|
|
||||||
return Params(
|
return Params(
|
||||||
|
@ -822,6 +828,7 @@ def lazy_load_torch_file(outer_fp: IO[bytes], path: Path) -> ModelPlus:
|
||||||
|
|
||||||
|
|
||||||
SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
|
SAFETENSORS_DATA_TYPES: Dict[str, DataType] = {
|
||||||
|
'BF16': DT_BF16,
|
||||||
'F16': DT_F16,
|
'F16': DT_F16,
|
||||||
'F32': DT_F32,
|
'F32': DT_F32,
|
||||||
'I32': DT_I32,
|
'I32': DT_I32,
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
cd `dirname $0`
|
cd `dirname $0`
|
||||||
cd ..
|
cd ..
|
||||||
|
|
||||||
./main -m ./models/ggml-alpaca-7b-q4.bin \
|
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
|
||||||
--color \
|
--color \
|
||||||
-f ./prompts/alpaca.txt \
|
-f ./prompts/alpaca.txt \
|
||||||
--ctx_size 2048 \
|
--ctx_size 2048 \
|
||||||
|
|
|
@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) {
|
||||||
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||||
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||||
|
|
||||||
|
if (plan.work_size > 0) {
|
||||||
|
buf.resize(plan.work_size);
|
||||||
|
plan.work_data = buf.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_graph_compute(graph, &plan);
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_tensor * randomize_tensor(
|
struct ggml_tensor * randomize_tensor(
|
||||||
struct ggml_tensor * tensor,
|
struct ggml_tensor * tensor,
|
||||||
int ndims,
|
int ndims,
|
||||||
|
@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) {
|
||||||
int n_tokens = model.hparams.n_ctx;
|
int n_tokens = model.hparams.n_ctx;
|
||||||
int n_vocab = model.hparams.n_vocab;
|
int n_vocab = model.hparams.n_vocab;
|
||||||
|
|
||||||
|
std::vector<uint8_t> work_buffer;
|
||||||
|
|
||||||
for (int ex=0; ex<n_examples; ++ex) {
|
for (int ex=0; ex<n_examples; ++ex) {
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/*.mem_size =*/ compute_size,
|
/*.mem_size =*/ compute_size,
|
||||||
|
@ -1586,7 +1599,6 @@ int main(int argc, char ** argv) {
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
|
|
||||||
ggml_cgraph gf = {};
|
ggml_cgraph gf = {};
|
||||||
gf.n_threads = 1;
|
|
||||||
|
|
||||||
get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
|
get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
|
||||||
|
|
||||||
|
@ -1595,7 +1607,7 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
|
struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
|
||||||
|
|
||||||
ggml_build_forward_expand(&gf, e);
|
ggml_build_forward_expand(&gf, e);
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
|
||||||
|
|
||||||
float error_before_opt = ggml_get_f32_1d(e, 0);
|
float error_before_opt = ggml_get_f32_1d(e, 0);
|
||||||
|
|
||||||
|
@ -1611,7 +1623,7 @@ int main(int argc, char ** argv) {
|
||||||
ggml_opt(ctx0, opt_params_lbfgs, e);
|
ggml_opt(ctx0, opt_params_lbfgs, e);
|
||||||
//
|
//
|
||||||
ggml_build_forward_expand(&gf, e);
|
ggml_build_forward_expand(&gf, e);
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
|
||||||
|
|
||||||
float error_after_opt = ggml_get_f32_1d(e, 0);
|
float error_after_opt = ggml_get_f32_1d(e, 0);
|
||||||
|
|
||||||
|
@ -1659,13 +1671,12 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_context * ctx0 = ggml_init(params);
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
|
|
||||||
ggml_cgraph gf = {};
|
ggml_cgraph gf = {};
|
||||||
gf.n_threads = 1;
|
|
||||||
|
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
|
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
|
||||||
|
|
||||||
ggml_build_forward_expand(&gf, logits);
|
ggml_build_forward_expand(&gf, logits);
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
|
||||||
|
|
||||||
struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
|
struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
|
||||||
struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
|
struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
|
||||||
|
@ -1687,10 +1698,11 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
print_matrix(model.tok_embeddings);
|
print_matrix(model.tok_embeddings);
|
||||||
|
|
||||||
printf("done\n");
|
printf("done\n");
|
||||||
|
|
||||||
// ggml_free(kv_self.ctx);
|
// ggml_free(kv_self.ctx);
|
||||||
// ggml_free(model_lora.ctx);
|
// ggml_free(model_lora.ctx);
|
||||||
ggml_free(model.ctx);
|
ggml_free(model.ctx);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -20,6 +20,17 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||||
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||||
|
|
||||||
|
if (plan.work_size > 0) {
|
||||||
|
buf.resize(plan.work_size);
|
||||||
|
plan.work_data = buf.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_graph_compute(graph, &plan);
|
||||||
|
}
|
||||||
|
|
||||||
float tensor_sum_elements(const ggml_tensor * tensor) {
|
float tensor_sum_elements(const ggml_tensor * tensor) {
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
if (tensor->type==GGML_TYPE_F32) {
|
if (tensor->type==GGML_TYPE_F32) {
|
||||||
|
@ -159,13 +170,14 @@ int main(int argc, char ** argv) {
|
||||||
// printf("Creating compute graph\n");
|
// printf("Creating compute graph\n");
|
||||||
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
|
struct ggml_cgraph gf = ggml_build_forward(m11xm2);
|
||||||
|
|
||||||
gf.n_threads=benchmark_params.n_threads;
|
printf("n_threads=%i\n", benchmark_params.n_threads);
|
||||||
printf("cgraph->n_threads=%i\n",gf.n_threads);
|
|
||||||
|
|
||||||
TENSOR_DUMP(m11);
|
TENSOR_DUMP(m11);
|
||||||
TENSOR_DUMP(m2);
|
TENSOR_DUMP(m2);
|
||||||
|
|
||||||
ggml_graph_compute(ctx, &gf);
|
std::vector<uint8_t> work_buffer;
|
||||||
|
|
||||||
|
ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
|
||||||
|
|
||||||
TENSOR_DUMP(gf.nodes[0]);
|
TENSOR_DUMP(gf.nodes[0]);
|
||||||
|
|
||||||
|
@ -187,7 +199,6 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// printf("Creating compute graph\n");
|
// printf("Creating compute graph\n");
|
||||||
struct ggml_cgraph gf31 = ggml_build_forward(q31);
|
struct ggml_cgraph gf31 = ggml_build_forward(q31);
|
||||||
gf31.n_threads=benchmark_params.n_threads;
|
|
||||||
|
|
||||||
// Set up a second graph computation to make sure we override the CPU cache lines
|
// Set up a second graph computation to make sure we override the CPU cache lines
|
||||||
// printf("Creating new tensor q12 & Running quantize\n");
|
// printf("Creating new tensor q12 & Running quantize\n");
|
||||||
|
@ -199,8 +210,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
//printf("Creating compute graph\n");
|
//printf("Creating compute graph\n");
|
||||||
struct ggml_cgraph gf32 = ggml_build_forward(q32);
|
struct ggml_cgraph gf32 = ggml_build_forward(q32);
|
||||||
gf32.n_threads=benchmark_params.n_threads;
|
printf("n_threads=%i\n", benchmark_params.n_threads);
|
||||||
printf("cgraph->n_threads=%i\n",gf31.n_threads);
|
|
||||||
|
|
||||||
const int dimx = sizex;
|
const int dimx = sizex;
|
||||||
const int dimy = sizey;
|
const int dimy = sizey;
|
||||||
|
@ -221,14 +231,15 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
long long int start = ggml_time_us();
|
long long int start = ggml_time_us();
|
||||||
//printf("Running ggml_graph_compute\n");
|
//printf("Running ggml_graph_compute\n");
|
||||||
ggml_graph_compute(ctx, &gf31);
|
ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
|
||||||
|
|
||||||
long long int stop = ggml_time_us();
|
long long int stop = ggml_time_us();
|
||||||
long long int usec = stop-start;
|
long long int usec = stop-start;
|
||||||
double gflops = (double)(flops_per_matrix)/usec/1000.0;
|
double gflops = (double)(flops_per_matrix)/usec/1000.0;
|
||||||
gflops_sum += gflops;
|
gflops_sum += gflops;
|
||||||
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
|
printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
|
||||||
i,
|
i,
|
||||||
gf31.n_threads,
|
benchmark_params.n_threads,
|
||||||
sizex, sizey, sizez, flops_per_matrix,
|
sizex, sizey, sizez, flops_per_matrix,
|
||||||
usec,gflops);
|
usec,gflops);
|
||||||
|
|
||||||
|
@ -253,7 +264,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Running a different graph computation to make sure we override the CPU cache lines
|
// Running a different graph computation to make sure we override the CPU cache lines
|
||||||
ggml_graph_compute(ctx, &gf32);
|
ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
|
||||||
}
|
}
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
|
printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
|
||||||
|
|
|
@ -418,6 +418,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
|
|
||||||
if (escape_prompt) {
|
if (escape_prompt) {
|
||||||
process_escapes(params.prompt);
|
process_escapes(params.prompt);
|
||||||
|
process_escapes(params.input_prefix);
|
||||||
|
process_escapes(params.input_suffix);
|
||||||
}
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
|
@ -29,7 +29,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
|
||||||
|
|
||||||
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
|
||||||
|
|
||||||
if (params.seed < 0) {
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||||
params.seed = time(NULL);
|
params.seed = time(NULL);
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
|
fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
|
||||||
|
|
|
@ -18,7 +18,7 @@ int main(int argc, char ** argv) {
|
||||||
params.embedding = true;
|
params.embedding = true;
|
||||||
|
|
||||||
if (params.n_ctx > 2048) {
|
if (params.n_ctx > 2048) {
|
||||||
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||||
"expect poor results\n", __func__, params.n_ctx);
|
"expect poor results\n", __func__, params.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -85,7 +85,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (params.n_ctx > 2048) {
|
if (params.n_ctx > 2048) {
|
||||||
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||||
"expect poor results\n", __func__, params.n_ctx);
|
"expect poor results\n", __func__, params.n_ctx);
|
||||||
} else if (params.n_ctx < 8) {
|
} else if (params.n_ctx < 8) {
|
||||||
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
|
||||||
|
|
|
@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_context * ctx_eval = NULL;
|
struct ggml_context * ctx_eval = NULL;
|
||||||
|
|
||||||
struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
|
||||||
gf.n_threads = 1;
|
|
||||||
|
|
||||||
// this allocates all Metal resources and memory buffers
|
// this allocates all Metal resources and memory buffers
|
||||||
auto * ctx_metal = ggml_metal_init();
|
auto * ctx_metal = ggml_metal_init(1);
|
||||||
|
|
||||||
const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
|
const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
|
||||||
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
|
const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);
|
||||||
|
|
|
@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
|
||||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||||
|
|
||||||
if (params.n_ctx > 2048) {
|
if (params.n_ctx > 2048) {
|
||||||
fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
|
fprintf(stderr, "%s: warning: model might not support context sizes greater than 2048 tokens (%d specified);"
|
||||||
"expect poor results\n", __func__, params.n_ctx);
|
"expect poor results\n", __func__, params.n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -147,7 +147,7 @@ void test_roundtrip_on_chunk(
|
||||||
const ggml_tensor * layer,
|
const ggml_tensor * layer,
|
||||||
int64_t offset,
|
int64_t offset,
|
||||||
int64_t chunk_size,
|
int64_t chunk_size,
|
||||||
const quantize_fns_t & qfns,
|
const ggml_type_traits_t & qfns,
|
||||||
bool use_reference,
|
bool use_reference,
|
||||||
float * input_scratch,
|
float * input_scratch,
|
||||||
char * quantized_scratch,
|
char * quantized_scratch,
|
||||||
|
@ -163,11 +163,11 @@ void test_roundtrip_on_chunk(
|
||||||
}
|
}
|
||||||
|
|
||||||
if (use_reference) {
|
if (use_reference) {
|
||||||
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
|
qfns.from_float_reference(input_scratch, quantized_scratch, chunk_size);
|
||||||
} else {
|
} else {
|
||||||
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
|
qfns.from_float(input_scratch, quantized_scratch, chunk_size);
|
||||||
}
|
}
|
||||||
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);
|
qfns.to_float(quantized_scratch, output_scratch, chunk_size);
|
||||||
|
|
||||||
update_error_stats(chunk_size, input_scratch, output_scratch, stats);
|
update_error_stats(chunk_size, input_scratch, output_scratch, stats);
|
||||||
}
|
}
|
||||||
|
@ -177,7 +177,7 @@ void test_roundtrip_on_chunk(
|
||||||
void test_roundtrip_on_layer(
|
void test_roundtrip_on_layer(
|
||||||
std::string & name,
|
std::string & name,
|
||||||
bool print_layer_stats,
|
bool print_layer_stats,
|
||||||
const quantize_fns_t & qfns,
|
const ggml_type_traits_t & qfns,
|
||||||
bool use_reference,
|
bool use_reference,
|
||||||
const ggml_tensor * layer,
|
const ggml_tensor * layer,
|
||||||
std::vector<float> & input_scratch,
|
std::vector<float> & input_scratch,
|
||||||
|
@ -388,8 +388,8 @@ int main(int argc, char ** argv) {
|
||||||
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
||||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
if (qfns.from_float && qfns.to_float) {
|
||||||
if (params.verbose) {
|
if (params.verbose) {
|
||||||
printf("testing %s ...\n", ggml_type_name(type));
|
printf("testing %s ...\n", ggml_type_name(type));
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
# llama.cpp/example/server
|
# llama.cpp/example/server
|
||||||
|
|
||||||
This example demonstrates a simple HTTP API server to interact with llama.cpp.
|
This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
|
||||||
|
|
||||||
Command line options:
|
Command line options:
|
||||||
|
|
||||||
- `--threads N`, `-t N`: Set the number of threads to use during computation.
|
- `--threads N`, `-t N`: Set the number of threads to use during computation.
|
||||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
|
||||||
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
|
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models; for example, baichuan models were built with a context of 4096.
|
||||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||||
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
|
||||||
|
@ -21,24 +21,22 @@ Command line options:
|
||||||
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
|
||||||
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
|
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
|
||||||
- `--port`: Set the port to listen. Default: `8080`.
|
- `--port`: Set the port to listen. Default: `8080`.
|
||||||
|
- `--path`: Path from which to serve static files. Default: `examples/server/public`.
|
||||||
- `--embedding`: Enable embedding extraction, Default: disabled.
|
- `--embedding`: Enable embedding extraction, Default: disabled.
|
||||||
|
|
||||||
## Build
|
## Build
|
||||||
|
|
||||||
Build llama.cpp with server from repository root with either make or CMake.
|
The server is built alongside everything else from the root of the project.
|
||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
LLAMA_BUILD_SERVER=1 make
|
make
|
||||||
```
|
```
|
||||||
|
|
||||||
- Using `CMake`:
|
- Using `CMake`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
mkdir build-server
|
|
||||||
cd build-server
|
|
||||||
cmake -DLLAMA_BUILD_SERVER=ON ..
|
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -59,7 +57,7 @@ server.exe -m models\7B\ggml-model.bin -c 2048
|
||||||
```
|
```
|
||||||
|
|
||||||
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
||||||
You can consume the endpoints with Postman or NodeJS with axios library.
|
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
|
||||||
|
|
||||||
## Testing with CURL
|
## Testing with CURL
|
||||||
|
|
||||||
|
@ -190,3 +188,49 @@ Run with bash:
|
||||||
```sh
|
```sh
|
||||||
bash chat.sh
|
bash chat.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### API like OAI
|
||||||
|
|
||||||
|
API example using Python Flask: [api_like_OAI.py](api_like_OAI.py)
|
||||||
|
This example must be used with server.cpp
|
||||||
|
|
||||||
|
```sh
|
||||||
|
python api_like_OAI.py
|
||||||
|
```
|
||||||
|
|
||||||
|
After running the API server, you can use it in Python by setting the API base URL.
|
||||||
|
```python
|
||||||
|
openai.api_base = "http://<Your api-server IP>:port"
|
||||||
|
```
|
||||||
|
|
||||||
|
Then you can use llama.cpp as an OpenAI-style **chat.completion** or **text_completion** API.
|
||||||
|
|
||||||
|
### Extending or building alternative Web Front End
|
||||||
|
|
||||||
|
The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method.
|
||||||
|
|
||||||
|
Read the documentation in `/completion.js` to see convenient ways to access llama.
|
||||||
|
|
||||||
|
A simple example is below:
|
||||||
|
|
||||||
|
```html
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<pre>
|
||||||
|
<script type="module">
|
||||||
|
import { llama } from '/completion.js'
|
||||||
|
|
||||||
|
const prompt = `### Instruction:
|
||||||
|
Write dad jokes, each one paragraph.
|
||||||
|
You can use html formatting if needed.
|
||||||
|
|
||||||
|
### Response:`
|
||||||
|
|
||||||
|
for await (const chunk of llama(prompt)) {
|
||||||
|
document.write(chunk.data.content)
|
||||||
|
}
|
||||||
|
</script>
|
||||||
|
</pre>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
```
|
||||||
|
|
219
examples/server/api_like_OAI.py
Executable file
219
examples/server/api_like_OAI.py
Executable file
|
@ -0,0 +1,219 @@
|
||||||
|
import argparse
|
||||||
|
from flask import Flask, jsonify, request, Response
|
||||||
|
import urllib.parse
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
# Flask application object; the route decorators further down register
# their handlers on it.
app = Flask(__name__)

# Command-line options. The string defaults spell a newline as the two
# characters backslash + "n"; convert_chat() turns them into real newlines.
parser = argparse.ArgumentParser(
    description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.",
)

# Prompt-template pieces used when a chat is flattened into one prompt.
parser.add_argument(
    "--chat-prompt",
    type=str,
    default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n',
    help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')",
)
parser.add_argument(
    "--user-name",
    type=str,
    default="\\nUSER: ",
    help="USER name in chat completions(default: '\\nUSER: ')",
)
parser.add_argument(
    "--ai-name",
    type=str,
    default="\\nASSISTANT: ",
    help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')",
)
parser.add_argument(
    "--system-name",
    type=str,
    default="\\nASSISTANT's RULE: ",
    help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')",
)
parser.add_argument(
    "--stop",
    type=str,
    default="</s>",
    help="the end of response in chat completions(default: '</s>')",
)

# Where the llama.cpp server lives and how this proxy is exposed.
parser.add_argument(
    "--llama-api",
    type=str,
    default='http://127.0.0.1:8080',
    help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)",
)
parser.add_argument(
    "--api-key",
    type=str,
    default="",
    help="Set the api key to allow only few user(default: NULL)",
)
parser.add_argument(
    "--host",
    type=str,
    default='127.0.0.1',
    help="Set the ip address to listen.(default: 127.0.0.1)",
)
parser.add_argument(
    "--port",
    type=int,
    default=8081,
    help="Set the port to listen.(default: 8081)",
)

# Parsed once at import time; the functions below read this module-level object.
args = parser.parse_args()
|
||||||
|
|
||||||
|
def is_present(json, key):
    """Return True when ``json[key]`` can be looked up, False otherwise.

    A key whose value is ``None`` still counts as present. A container that
    cannot be indexed with ``key`` at all (for example ``None``, which is what
    a request without a JSON body may produce) is reported as "not present"
    instead of raising.

    Args:
        json: The parsed request body, normally a dict.
        key: The key to look for.

    Returns:
        bool: whether the lookup succeeded.
    """
    try:
        json[key]
    except (KeyError, IndexError, TypeError):
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#convert chat to prompt
|
||||||
|
def convert_chat(messages):
    """Flatten a list of chat messages into one prompt string.

    The prompt starts with ``args.chat_prompt``. Each message is then appended
    with the prefix configured for its role (``args.system_name``,
    ``args.user_name`` or ``args.ai_name``); assistant messages are followed by
    ``args.stop``. Messages with any other role are ignored. The prompt ends
    with the assistant prefix, right-stripped, so generation continues as the
    assistant.

    In every configured string a literal backslash + "n" is converted to a
    real newline.

    Args:
        messages: Iterable of dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        str: the assembled prompt.
    """
    def unescape(text):
        return text.replace("\\n", "\n")

    pieces = [unescape(args.chat_prompt)]

    system_n = unescape(args.system_name)
    user_n = unescape(args.user_name)
    ai_n = unescape(args.ai_name)
    stop = unescape(args.stop)

    for message in messages:
        role = message["role"]
        if role == "system":
            pieces.append(f"{system_n}{message['content']}")
        elif role == "user":
            pieces.append(f"{user_n}{message['content']}")
        elif role == "assistant":
            pieces.append(f"{ai_n}{message['content']}{stop}")

    # Trailing assistant prefix without its trailing whitespace.
    pieces.append(ai_n.rstrip())

    return "".join(pieces)
|
||||||
|
|
||||||
|
def make_postData(body, chat=False, stream=False):
    """Translate an OAI-style request body into a payload for server.cpp.

    Args:
        body: Parsed JSON request body (dict).
        chat: When True the prompt is built from ``body["messages"]`` via
            ``convert_chat``; otherwise ``body["prompt"]`` is used as is.
        stream: Copied into the payload's ``"stream"`` field.

    Returns:
        dict: payload for the ``/completion`` endpoint. ``"stop"`` is always a
        list: ``args.stop`` (when non-empty) followed by any client-supplied
        stop sequences. ``"n_keep"`` is always ``-1``.
    """
    postData = {}
    if chat:
        postData["prompt"] = convert_chat(body["messages"])
    else:
        postData["prompt"] = body["prompt"]

    # Options forwarded only when the client supplied them:
    # (key in the request body, key in the outgoing payload).
    passthrough = (
        ("temperature", "temperature"),
        ("top_k", "top_k"),
        ("top_p", "top_p"),
        ("max_tokens", "n_predict"),
        ("presence_penalty", "presence_penalty"),
        ("frequency_penalty", "frequency_penalty"),
        ("repeat_penalty", "repeat_penalty"),
        ("mirostat", "mirostat"),
        ("mirostat_tau", "mirostat_tau"),
        ("mirostat_eta", "mirostat_eta"),
        ("seed", "seed"),
    )
    for src_key, dst_key in passthrough:
        if is_present(body, src_key):
            postData[dst_key] = body[src_key]

    # The request maps token id -> bias; the payload uses [id, bias] pairs.
    if is_present(body, "logit_bias"):
        postData["logit_bias"] = [[int(token), bias] for token, bias in body["logit_bias"].items()]

    postData["stop"] = [args.stop] if args.stop != "" else []
    if is_present(body, "stop"):
        extra_stop = body["stop"]
        if isinstance(extra_stop, str):
            # A bare string must be added as one stop sequence; `list += str`
            # would add every character as its own stop sequence.
            extra_stop = [extra_stop]
        if extra_stop is not None:
            # An explicit null means "no extra stop sequences".
            postData["stop"] += extra_stop

    postData["n_keep"] = -1
    postData["stream"] = stream

    return postData
|
||||||
|
|
||||||
|
def make_resData(data, chat=False, promptToken=None):
    """Build a non-streaming OAI-style response from a server.cpp result.

    Args:
        data: Parsed JSON result of ``/completion``. The keys read are
            ``truncated``, ``tokens_evaluated``, ``tokens_predicted``,
            ``content``, ``stopped_eos`` and ``stopped_word``.
        chat: True for a ``chat.completion`` response, False for a
            ``text_completion`` response.
        promptToken: Optional list of prompt tokens; included in the response
            under ``"promptToken"`` only when non-empty.

    Returns:
        dict: the response body, always with exactly one entry in ``choices``.
    """
    resData = {
        "id": "chatcmpl" if chat else "cmpl",
        "object": "chat.completion" if chat else "text_completion",
        "created": int(time.time()),
        "truncated": data["truncated"],
        "model": "LLaMA_CPP",
        "usage": {
            "prompt_tokens": data["tokens_evaluated"],
            "completion_tokens": data["tokens_predicted"],
            "total_tokens": data["tokens_evaluated"] + data["tokens_predicted"]
        }
    }
    if promptToken:
        resData["promptToken"] = promptToken

    # "stop" when generation ended on EOS or a stop word, otherwise "length".
    finish_reason = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"

    # Only one choice is supported.
    if chat:
        resData["choices"] = [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": data["content"],
            },
            "finish_reason": finish_reason
        }]
    else:
        resData["choices"] = [{
            "text": data["content"],
            "index": 0,
            "logprobs": None,
            "finish_reason": finish_reason
        }]
    return resData
|
||||||
|
|
||||||
|
def make_resData_stream(data, chat=False, time_now = 0, start=False):
    """Build one OAI-style streaming chunk from one server.cpp stream event.

    Args:
        data: Parsed JSON of one streamed event. ``content`` and ``stop`` are
            read, plus ``stopped_eos`` / ``stopped_word`` when ``stop`` is
            truthy. Not read at all for the opening chat chunk.
        chat: True for ``chat.completion.chunk``, False for
            ``text_completion.chunk``.
        time_now: Value for the ``created`` field.
        start: Chat mode only: emit the opening chunk, whose delta carries
            just the assistant role.

    Returns:
        dict: the chunk, with a single entry in ``choices`` whose
        ``finish_reason`` stays ``None`` until ``data["stop"]`` is truthy.
    """
    choice = {
        "finish_reason": None,
        "index": 0
    }
    resData = {
        "id": "chatcmpl" if chat else "cmpl",
        "object": "chat.completion.chunk" if chat else "text_completion.chunk",
        "created": time_now,
        "model": "LLaMA_CPP",
        "choices": [choice]
    }

    if chat and start:
        # Opening chunk: announces the role and carries no content.
        choice["delta"] = {
            "role": "assistant"
        }
        return resData

    if chat:
        choice["delta"] = {
            "content": data["content"]
        }
    else:
        choice["text"] = data["content"]

    # Final event: "stop" for EOS or a stop word, otherwise "length".
    if data["stop"]:
        choice["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"

    return resData
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/chat/completions', methods=['POST'])
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    """Handle an OAI-style chat completion request by proxying to server.cpp.

    The request body is converted with ``make_postData(chat=True)`` and POSTed
    to ``/completion`` on ``args.llama_api``. When the body sets ``tokenize``,
    the prompt is also sent to ``/tokenize`` and the tokens are attached to a
    non-streaming response. When the body sets ``stream``, the reply is a
    ``text/event-stream`` response.

    Returns:
        A 403 response when an API key is configured and the request does not
        present it; otherwise a JSON response or an event stream.
    """
    if args.api_key != "":
        # The key is the second whitespace-separated field of the header
        # (presumably "Bearer <key>"). A missing or malformed header is
        # rejected with 403 instead of raising.
        auth_fields = request.headers.get("Authorization", "").split()
        if len(auth_fields) < 2 or auth_fields[1] != args.api_key:
            return Response(status=403)

    body = request.get_json()
    stream = False
    tokenize = False
    if is_present(body, "stream"):
        stream = body["stream"]
    if is_present(body, "tokenize"):
        tokenize = body["tokenize"]
    postData = make_postData(body, chat=True, stream=stream)

    promptToken = []
    if tokenize:
        tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
        promptToken = tokenData["tokens"]

    completion_url = urllib.parse.urljoin(args.llama_api, "/completion")

    if not stream:
        data = requests.request("POST", completion_url, data=json.dumps(postData))
        # Parse the upstream body once and reuse it for the log line and the response.
        completion_json = data.json()
        print(completion_json)
        resData = make_resData(completion_json, chat=True, promptToken=promptToken)
        return jsonify(resData)

    def generate():
        data = requests.request("POST", completion_url, data=json.dumps(postData), stream=True)
        time_now = int(time.time())
        # Opening chunk announcing the assistant role. Each server-sent event
        # must end with a blank line, hence the double newline.
        resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
        yield 'data: {}\n\n'.format(json.dumps(resData))
        for line in data.iter_lines():
            if line:
                decoded_line = line.decode('utf-8')
                # Skip the first 6 characters (presumably the "data: " prefix).
                resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
                yield 'data: {}\n\n'.format(json.dumps(resData))

    return Response(generate(), mimetype='text/event-stream')
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/completions', methods=['POST'])
@app.route('/v1/completions', methods=['POST'])
def completion():
    """Handle an OAI-style text completion request by proxying to server.cpp.

    The request body is converted with ``make_postData(chat=False)`` and
    POSTed to ``/completion`` on ``args.llama_api``. When the body sets
    ``tokenize``, the prompt is also sent to ``/tokenize`` and the tokens are
    attached to a non-streaming response. When the body sets ``stream``, the
    reply is a ``text/event-stream`` response.

    Returns:
        A 403 response when an API key is configured and the request does not
        present it; otherwise a JSON response or an event stream.
    """
    if args.api_key != "":
        # The key is the second whitespace-separated field of the header
        # (presumably "Bearer <key>"). A missing or malformed header is
        # rejected with 403 instead of raising.
        auth_fields = request.headers.get("Authorization", "").split()
        if len(auth_fields) < 2 or auth_fields[1] != args.api_key:
            return Response(status=403)

    body = request.get_json()
    stream = False
    tokenize = False
    if is_present(body, "stream"):
        stream = body["stream"]
    if is_present(body, "tokenize"):
        tokenize = body["tokenize"]
    postData = make_postData(body, chat=False, stream=stream)

    promptToken = []
    if tokenize:
        tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
        promptToken = tokenData["tokens"]

    completion_url = urllib.parse.urljoin(args.llama_api, "/completion")

    if not stream:
        data = requests.request("POST", completion_url, data=json.dumps(postData))
        # Parse the upstream body once and reuse it for the log line and the response.
        completion_json = data.json()
        print(completion_json)
        resData = make_resData(completion_json, chat=False, promptToken=promptToken)
        return jsonify(resData)

    def generate():
        data = requests.request("POST", completion_url, data=json.dumps(postData), stream=True)
        time_now = int(time.time())
        for line in data.iter_lines():
            if line:
                decoded_line = line.decode('utf-8')
                # Skip the first 6 characters (presumably the "data: " prefix).
                resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
                # Each server-sent event must end with a blank line.
                yield 'data: {}\n\n'.format(json.dumps(resData))

    return Response(generate(), mimetype='text/event-stream')
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Serve the app on the address given by --host / --port.
    app.run(host=args.host, port=args.port)
|
|
@ -7,187 +7,369 @@ unsigned char completion_js[] = {
|
||||||
0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a,
|
0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a,
|
||||||
0x20, 0x30, 0x2e, 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70,
|
0x20, 0x30, 0x2e, 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70,
|
||||||
0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x5d, 0x0a, 0x7d,
|
0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x5d, 0x0a, 0x7d,
|
||||||
0x3b, 0x0a, 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x54, 0x68,
|
0x3b, 0x0a, 0x0a, 0x6c, 0x65, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72,
|
||||||
0x69, 0x73, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20,
|
0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
|
||||||
0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x73, 0x20, 0x74, 0x68,
|
0x67, 0x73, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x0a,
|
||||||
0x65, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74,
|
0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65,
|
||||||
0x20, 0x75, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x61, 0x20, 0x6c, 0x6c, 0x61,
|
0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
|
||||||
0x6d, 0x61, 0x20, 0x64, 0x69, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x72,
|
0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61,
|
||||||
0x79, 0x2e, 0x0a, 0x20, 0x2a, 0x20, 0x40, 0x70, 0x61, 0x72, 0x61, 0x6d,
|
0x74, 0x6f, 0x72, 0x2e, 0x20, 0x52, 0x65, 0x63, 0x6f, 0x6d, 0x6d, 0x65,
|
||||||
0x20, 0x7b, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x7d, 0x20, 0x70, 0x61,
|
0x6e, 0x64, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6d, 0x6f, 0x73,
|
||||||
0x72, 0x61, 0x6d, 0x73, 0x20, 0x2d, 0x20, 0x54, 0x68, 0x65, 0x20, 0x70,
|
0x74, 0x20, 0x75, 0x73, 0x65, 0x20, 0x63, 0x61, 0x73, 0x65, 0x73, 0x2e,
|
||||||
0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x20, 0x66, 0x6f,
|
0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70,
|
||||||
0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
|
0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
|
||||||
0x74, 0x69, 0x6f, 0x6e, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74,
|
0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c,
|
||||||
0x2e, 0x0a, 0x20, 0x2a, 0x20, 0x40, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x20,
|
0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27,
|
||||||
0x7b, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x7d, 0x20, 0x63, 0x6f, 0x6e,
|
0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e,
|
||||||
0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x2d, 0x20, 0x61, 0x6e,
|
0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20,
|
||||||
0x20, 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x20, 0x6f, 0x66,
|
0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
|
||||||
0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f,
|
0x73, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x22,
|
||||||
0x6c, 0x6c, 0x65, 0x72, 0x20, 0x69, 0x66, 0x20, 0x79, 0x6f, 0x75, 0x20,
|
0x54, 0x65, 0x6c, 0x6c, 0x20, 0x6d, 0x65, 0x20, 0x61, 0x20, 0x6a, 0x6f,
|
||||||
0x6e, 0x65, 0x65, 0x64, 0x20, 0x6f, 0x6e, 0x65, 0x2c, 0x20, 0x6f, 0x72,
|
0x6b, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64,
|
||||||
0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x2e, 0x0a, 0x20, 0x2a, 0x20, 0x40, 0x70,
|
0x69, 0x63, 0x74, 0x3a, 0x20, 0x38, 0x30, 0x30, 0x7d, 0x29, 0x0a, 0x2f,
|
||||||
0x61, 0x72, 0x61, 0x6d, 0x20, 0x7b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69,
|
0x2f, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61,
|
||||||
0x6f, 0x6e, 0x7d, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b,
|
0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68,
|
||||||
0x20, 0x2d, 0x20, 0x54, 0x68, 0x65, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
|
0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65,
|
||||||
0x61, 0x63, 0x6b, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e,
|
0x73, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20,
|
||||||
0x20, 0x74, 0x6f, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x20, 0x77, 0x68, 0x65,
|
0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77,
|
||||||
0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
|
0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
|
||||||
0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x6e, 0x65,
|
0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29,
|
||||||
0x2e, 0x0a, 0x20, 0x2a, 0x20, 0x40, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e,
|
0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x2f, 0x2f, 0x0a,
|
||||||
0x73, 0x20, 0x7b, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x7d, 0x20, 0x74,
|
0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63,
|
||||||
0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64,
|
0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x2a, 0x20, 0x6c,
|
||||||
0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x73,
|
0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
|
||||||
0x74, 0x72, 0x69, 0x6e, 0x67, 0x2e, 0x20, 0x49, 0x64, 0x65, 0x61, 0x6c,
|
0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d,
|
||||||
0x6c, 0x79, 0x20, 0x69, 0x67, 0x6e, 0x6f, 0x72, 0x65, 0x64, 0x2c, 0x20,
|
0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b,
|
||||||
0x61, 0x6e, 0x64, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x67, 0x65, 0x74, 0x20,
|
0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
|
||||||
0x61, 0x74, 0x20, 0x69, 0x74, 0x20, 0x76, 0x69, 0x61, 0x20, 0x74, 0x68,
|
0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20,
|
||||||
0x65, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x2e, 0x0a,
|
0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72,
|
||||||
0x20, 0x2a, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63,
|
0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x69, 0x66,
|
||||||
0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f,
|
0x20, 0x28, 0x21, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65,
|
||||||
0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79,
|
0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
|
||||||
0x6e, 0x63, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20,
|
0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65,
|
||||||
0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20,
|
0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72,
|
||||||
0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e,
|
0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d,
|
||||||
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x63, 0x6f,
|
0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f,
|
||||||
0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a,
|
0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61,
|
||||||
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c,
|
0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61,
|
||||||
0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, 0x6f,
|
0x72, 0x61, 0x6d, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x2c,
|
||||||
0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72,
|
0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20,
|
||||||
0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x63, 0x6f,
|
0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20,
|
||||||
0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69,
|
0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f,
|
||||||
0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b,
|
0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
|
||||||
0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44, 0x65, 0x66,
|
0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x63, 0x6f, 0x6d, 0x70,
|
||||||
0x61, 0x75, 0x6c, 0x74, 0x73, 0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61,
|
0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x2c, 0x20, 0x7b, 0x0a, 0x20,
|
||||||
0x72, 0x61, 0x6d, 0x73, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x2f,
|
0x20, 0x20, 0x20, 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x3a, 0x20, 0x27,
|
||||||
0x2f, 0x20, 0x77, 0x65, 0x20, 0x75, 0x73, 0x65, 0x20, 0x66, 0x65, 0x74,
|
0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x62,
|
||||||
0x63, 0x68, 0x20, 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x6c, 0x79, 0x20,
|
0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74,
|
||||||
0x68, 0x65, 0x72, 0x65, 0x20, 0x62, 0x65, 0x63, 0x61, 0x73, 0x75, 0x65,
|
0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f, 0x6d, 0x70,
|
||||||
0x20, 0x74, 0x68, 0x65, 0x20, 0x62, 0x75, 0x69, 0x6c, 0x74, 0x20, 0x69,
|
0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73,
|
||||||
0x6e, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74,
|
0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65,
|
||||||
0x53, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20,
|
0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||||
0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20,
|
0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x27,
|
||||||
0x50, 0x4f, 0x53, 0x54, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
|
0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61, 0x6c, 0x69, 0x76,
|
||||||
0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20,
|
0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x43,
|
||||||
0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28,
|
0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, 0x79, 0x70, 0x65, 0x27,
|
||||||
0x22, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
|
0x3a, 0x20, 0x27, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69,
|
||||||
0x22, 0x2c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x74,
|
0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a, 0x20, 0x20,
|
||||||
0x68, 0x6f, 0x64, 0x3a, 0x20, 0x27, 0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c,
|
0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x27,
|
||||||
0x0a, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a,
|
0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76, 0x65, 0x6e,
|
||||||
0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66,
|
0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x0a, 0x20, 0x20,
|
||||||
0x79, 0x28, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e,
|
0x20, 0x20, 0x7d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67,
|
||||||
0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20,
|
0x6e, 0x61, 0x6c, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c,
|
||||||
0x20, 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a,
|
0x6c, 0x65, 0x72, 0x2e, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a,
|
||||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65,
|
0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e,
|
||||||
0x63, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65,
|
0x73, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20,
|
||||||
0x70, 0x2d, 0x61, 0x6c, 0x69, 0x76, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20,
|
0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64,
|
||||||
0x20, 0x20, 0x20, 0x20, 0x27, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
|
0x79, 0x2e, 0x67, 0x65, 0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28,
|
||||||
0x2d, 0x54, 0x79, 0x70, 0x65, 0x27, 0x3a, 0x20, 0x27, 0x61, 0x70, 0x70,
|
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64,
|
||||||
0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f,
|
0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77,
|
||||||
0x6e, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x41,
|
0x20, 0x54, 0x65, 0x78, 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72,
|
||||||
0x63, 0x63, 0x65, 0x70, 0x74, 0x27, 0x3a, 0x20, 0x27, 0x74, 0x65, 0x78,
|
0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63,
|
||||||
0x74, 0x2f, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2d, 0x73, 0x74, 0x72, 0x65,
|
0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b,
|
||||||
0x61, 0x6d, 0x27, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x0a, 0x20,
|
0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
||||||
0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x3a, 0x20, 0x63,
|
0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d,
|
||||||
0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x73, 0x69,
|
0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
||||||
0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a,
|
0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29,
|
||||||
0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x61,
|
0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
|
||||||
0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, 0x6e,
|
0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20,
|
||||||
0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e, 0x67, 0x65, 0x74, 0x52,
|
0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72,
|
||||||
0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x63,
|
0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
|
||||||
0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72,
|
0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c,
|
||||||
0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54, 0x65, 0x78, 0x74, 0x44,
|
0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
||||||
0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x0a, 0x20,
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b,
|
||||||
0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
|
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
|
||||||
0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72,
|
0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x73, 0x65, 0x20, 0x61,
|
||||||
0x79, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74,
|
0x6e, 0x73, 0x77, 0x65, 0x72, 0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68,
|
||||||
0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65,
|
0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69,
|
||||||
0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65,
|
0x70, 0x6c, 0x65, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x20, 0x6f, 0x66,
|
||||||
0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
0x3a, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x5c, 0x6e, 0x20, 0x77, 0x69,
|
||||||
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65,
|
0x74, 0x68, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x61, 0x6c, 0x77, 0x61,
|
||||||
0x73, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74,
|
0x79, 0x73, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x74, 0x20, 0x61,
|
||||||
0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64,
|
0x73, 0x20, 0x61, 0x20, 0x6b, 0x65, 0x79, 0x2e, 0x20, 0x69, 0x6e, 0x20,
|
||||||
0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66,
|
0x6f, 0x75, 0x72, 0x20, 0x63, 0x61, 0x73, 0x65, 0x20, 0x77, 0x65, 0x0a,
|
||||||
0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e,
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6d, 0x61, 0x69,
|
||||||
0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
0x6e, 0x6c, 0x79, 0x20, 0x63, 0x61, 0x72, 0x65, 0x20, 0x61, 0x62, 0x6f,
|
||||||
0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
0x75, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a,
|
||||||
0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f,
|
0x20, 0x6b, 0x65, 0x79, 0x20, 0x68, 0x65, 0x72, 0x65, 0x2c, 0x20, 0x77,
|
||||||
0x2f, 0x20, 0x73, 0x73, 0x65, 0x20, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72,
|
0x68, 0x69, 0x63, 0x68, 0x20, 0x77, 0x65, 0x20, 0x65, 0x78, 0x70, 0x65,
|
||||||
0x73, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x65, 0x20, 0x66, 0x6f, 0x72,
|
0x63, 0x74, 0x20, 0x61, 0x73, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x0a, 0x20,
|
||||||
0x6d, 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x70, 0x6c, 0x65, 0x20, 0x6c,
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74,
|
||||||
0x69, 0x6e, 0x65, 0x73, 0x20, 0x6f, 0x66, 0x3a, 0x20, 0x76, 0x61, 0x6c,
|
0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65,
|
||||||
0x75, 0x65, 0x5c, 0x6e, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x64, 0x61,
|
0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72, 0x65, 0x73,
|
||||||
0x74, 0x61, 0x20, 0x61, 0x6c, 0x77, 0x61, 0x79, 0x73, 0x20, 0x70, 0x72,
|
0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a,
|
||||||
0x65, 0x73, 0x65, 0x6e, 0x74, 0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x6b,
|
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x61,
|
||||||
0x65, 0x79, 0x2e, 0x20, 0x69, 0x6e, 0x20, 0x6f, 0x75, 0x72, 0x20, 0x63,
|
0x72, 0x73, 0x65, 0x20, 0x61, 0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20,
|
||||||
0x61, 0x73, 0x65, 0x20, 0x77, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
|
0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61,
|
||||||
0x20, 0x2f, 0x2f, 0x20, 0x6d, 0x61, 0x69, 0x6e, 0x6c, 0x79, 0x20, 0x63,
|
0x64, 0x64, 0x20, 0x74, 0x68, 0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72,
|
||||||
0x61, 0x72, 0x65, 0x20, 0x61, 0x62, 0x6f, 0x75, 0x74, 0x20, 0x74, 0x68,
|
0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||||
0x65, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x6b, 0x65, 0x79, 0x20,
|
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20,
|
||||||
0x68, 0x65, 0x72, 0x65, 0x2c, 0x20, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20,
|
0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73,
|
||||||
0x77, 0x65, 0x20, 0x65, 0x78, 0x70, 0x65, 0x63, 0x74, 0x20, 0x61, 0x73,
|
0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20,
|
||||||
0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e,
|
||||||
0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d,
|
0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x6f, 0x66, 0x20,
|
||||||
0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63,
|
0x74, 0x65, 0x78, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x41, 0x6c,
|
||||||
0x6f, 0x64, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76,
|
0x6c, 0x28, 0x72, 0x65, 0x67, 0x65, 0x78, 0x29, 0x29, 0x20, 0x7b, 0x0a,
|
||||||
0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
|
||||||
0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x61, 0x72, 0x73, 0x65, 0x20, 0x61,
|
0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d,
|
||||||
0x6c, 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
|
0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a,
|
||||||
0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74, 0x68,
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20,
|
||||||
0x65, 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74,
|
0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20,
|
||||||
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
|
0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73,
|
||||||
0x20, 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e, 0x28,
|
0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70,
|
||||||
0x5c, 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29, 0x24,
|
0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73,
|
||||||
0x2f, 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66,
|
0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65,
|
||||||
0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61,
|
0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74,
|
||||||
0x74, 0x63, 0x68, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
|
0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75,
|
||||||
0x6d, 0x61, 0x74, 0x63, 0x68, 0x41, 0x6c, 0x6c, 0x28, 0x72, 0x65, 0x67,
|
0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53,
|
||||||
0x65, 0x78, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
|
0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73,
|
||||||
0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61,
|
0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20,
|
||||||
0x74, 0x63, 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61,
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74,
|
||||||
0x74, 0x63, 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
|
0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
|
||||||
0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
|
0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
|
||||||
0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20, 0x77, 0x65, 0x20, 0x6b, 0x6e,
|
0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79,
|
||||||
0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6c,
|
0x69, 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79,
|
||||||
0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x2c, 0x20, 0x6c, 0x65,
|
0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x3b,
|
||||||
0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63,
|
0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69,
|
||||||
0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6a, 0x73, 0x6f, 0x6e,
|
0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73,
|
||||||
0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, 0x20, 0x20,
|
0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72,
|
||||||
0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61,
|
0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77,
|
||||||
0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61,
|
0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b,
|
||||||
0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
|
0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||||
0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64,
|
||||||
0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72,
|
0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b, 0x0a,
|
||||||
0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63,
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
|
||||||
0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20,
|
0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e,
|
||||||
0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x61, 0x63,
|
0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
|
||||||
0x6b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28,
|
0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20,
|
||||||
0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x7b, 0x0a,
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e,
|
||||||
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74,
|
0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
|
||||||
0x20, 0x3d, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x28,
|
0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c,
|
||||||
0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x29, 0x20, 0x21, 0x3d, 0x20, 0x66,
|
0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
|
||||||
0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
|
||||||
0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
|
0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||||
0x69, 0x66, 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20,
|
0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72,
|
||||||
0x73, 0x74, 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66,
|
0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
|
||||||
0x72, 0x6f, 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20,
|
0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63,
|
||||||
0x77, 0x65, 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61,
|
0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20,
|
||||||
0x6b, 0x20, 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
|
0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, 0x6d,
|
||||||
0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e,
|
0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, 0x74,
|
||||||
0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, 0x29, 0x20, 0x7b,
|
0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
||||||
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65,
|
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e,
|
||||||
0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
|
0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
|
||||||
0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61,
|
0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65,
|
||||||
0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
|
||||||
0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x72,
|
0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, 0x20,
|
||||||
0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x20, 0x65,
|
0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x20,
|
||||||
0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x65, 0x29, 0x3b,
|
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f,
|
||||||
0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65,
|
0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29,
|
||||||
0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61,
|
0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74,
|
||||||
0x6c, 0x6c, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f,
|
0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b,
|
||||||
0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f,
|
0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
|
||||||
0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20,
|
0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
|
||||||
0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74,
|
0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, 0x74,
|
||||||
0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x7d, 0x0a
|
0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x79,
|
||||||
|
0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x63, 0x72,
|
||||||
|
0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f,
|
||||||
|
0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f,
|
||||||
|
0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72,
|
||||||
|
0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65,
|
||||||
|
0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, 0x20, 0x66,
|
||||||
|
0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65,
|
||||||
|
0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a,
|
||||||
|
0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
|
||||||
|
0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
|
||||||
|
0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28,
|
||||||
|
0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76,
|
||||||
|
0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28,
|
||||||
|
0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20, 0x28,
|
||||||
|
0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
|
||||||
|
0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75,
|
||||||
|
0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, 0x28, 0x63,
|
||||||
|
0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x2e,
|
||||||
|
0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20,
|
||||||
|
0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70,
|
||||||
|
0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c,
|
||||||
|
0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67,
|
||||||
|
0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74,
|
||||||
|
0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b,
|
||||||
|
0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20,
|
||||||
|
0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63,
|
||||||
|
0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
|
||||||
|
0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
|
||||||
|
0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x28, 0x29,
|
||||||
|
0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28,
|
||||||
|
0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c,
|
||||||
|
0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
|
||||||
|
0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72,
|
||||||
|
0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73,
|
||||||
|
0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c,
|
||||||
|
0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c,
|
||||||
|
0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e,
|
||||||
|
0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e,
|
||||||
|
0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
|
||||||
|
0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74,
|
||||||
|
0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74,
|
||||||
|
0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61,
|
||||||
|
0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77,
|
||||||
|
0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74,
|
||||||
|
0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, 0x20,
|
||||||
|
0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68,
|
||||||
|
0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x29, 0x29,
|
||||||
|
0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e,
|
||||||
|
0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72,
|
||||||
|
0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e,
|
||||||
|
0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
|
||||||
|
0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
|
||||||
|
0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
|
||||||
|
0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, 0x65, 0x6e,
|
||||||
|
0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
|
||||||
|
0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74,
|
||||||
|
0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64,
|
||||||
|
0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69,
|
||||||
|
0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20,
|
||||||
|
0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
|
||||||
|
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63,
|
||||||
|
0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69,
|
||||||
|
0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61,
|
||||||
|
0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63,
|
||||||
|
0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43,
|
||||||
|
0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22,
|
||||||
|
0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20,
|
||||||
|
0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e,
|
||||||
|
0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e,
|
||||||
|
0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65,
|
||||||
|
0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76,
|
||||||
|
0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74,
|
||||||
|
0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, 0x6f, 0x6e,
|
||||||
|
0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c,
|
||||||
|
0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20,
|
||||||
|
0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x28,
|
||||||
|
0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
|
||||||
|
0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b,
|
||||||
|
0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, 0x20,
|
||||||
|
0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72,
|
||||||
|
0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20,
|
||||||
|
0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65,
|
||||||
|
0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6d,
|
||||||
|
0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e,
|
||||||
|
0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, 0x20, 0x6e,
|
||||||
|
0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x73,
|
||||||
|
0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, 0x2f, 0x0a,
|
||||||
|
0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a,
|
||||||
|
0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c,
|
||||||
|
0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70,
|
||||||
|
0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28,
|
||||||
|
0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e,
|
||||||
|
0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
|
||||||
|
0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69,
|
||||||
|
0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a,
|
||||||
|
0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f,
|
||||||
|
0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, 0x0a, 0x2f,
|
||||||
|
0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
|
||||||
|
0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d,
|
||||||
|
0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
|
||||||
|
0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x6d,
|
||||||
|
0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64,
|
||||||
|
0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74,
|
||||||
|
0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f,
|
||||||
|
0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e,
|
||||||
|
0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d,
|
||||||
|
0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70,
|
||||||
|
0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20,
|
||||||
|
0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d,
|
||||||
|
0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
||||||
|
0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x50,
|
||||||
|
0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63,
|
||||||
|
0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, 0x20, 0x72,
|
||||||
|
0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74,
|
||||||
|
0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20,
|
||||||
|
0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b,
|
||||||
|
0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72,
|
||||||
|
0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73,
|
||||||
|
0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b,
|
||||||
|
0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e,
|
||||||
|
0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e,
|
||||||
|
0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65,
|
||||||
|
0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76,
|
||||||
|
0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20,
|
||||||
|
0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
||||||
|
0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x65,
|
||||||
|
0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
|
||||||
|
0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x2f,
|
||||||
|
0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, 0x72, 0x65,
|
||||||
|
0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, 0x0a, 0x65,
|
||||||
|
0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
|
||||||
|
0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74,
|
||||||
|
0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x70,
|
||||||
|
0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72,
|
||||||
|
0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62,
|
||||||
|
0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20,
|
||||||
|
0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63,
|
||||||
|
0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f,
|
||||||
|
0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, 0x72, 0x61,
|
||||||
|
0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70,
|
||||||
|
0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e,
|
||||||
|
0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, 0x29, 0x20,
|
||||||
|
0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, 0x62, 0x61,
|
||||||
|
0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, 0x0a, 0x20,
|
||||||
|
0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, 0x65, 0x74,
|
||||||
|
0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x20, 0x69,
|
||||||
|
0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x68, 0x65,
|
||||||
|
0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, 0x68, 0x69,
|
||||||
|
0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, 0x6c, 0x20,
|
||||||
|
0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20,
|
||||||
|
0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20,
|
||||||
|
0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73,
|
||||||
|
0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74,
|
||||||
|
0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61,
|
||||||
|
0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, 0x20,
|
||||||
|
0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20,
|
||||||
|
0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, 0x65, 0x6e,
|
||||||
|
0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74,
|
||||||
|
0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20,
|
||||||
|
0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73,
|
||||||
|
0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77,
|
||||||
|
0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f,
|
||||||
|
0x6d, 0x6f, 0x64, 0x65, 0x6c, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x22, 0x29,
|
||||||
|
0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, 0x72,
|
||||||
|
0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20,
|
||||||
|
0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67,
|
||||||
|
0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65,
|
||||||
|
0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a
|
||||||
};
|
};
|
||||||
unsigned int completion_js_len = 2275;
|
unsigned int completion_js_len = 4462;
|
||||||
|
|
|
@ -4,10 +4,6 @@
|
||||||
# get the directory of this script file
|
# get the directory of this script file
|
||||||
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||||
PUBLIC=$DIR/public
|
PUBLIC=$DIR/public
|
||||||
OUTPUT=$DIR/templats.hpp
|
|
||||||
|
|
||||||
echo "// Generated file, do not edit" > $OUTPUT
|
|
||||||
echo "" > $OUTPUT
|
|
||||||
|
|
||||||
echo "download js bundle files"
|
echo "download js bundle files"
|
||||||
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
|
curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -5,20 +5,29 @@ const paramDefaults = {
|
||||||
stop: ["</s>"]
|
stop: ["</s>"]
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
let generation_settings = null;
|
||||||
* This function completes the input text using a llama dictionary.
|
|
||||||
* @param {object} params - The parameters for the completion request.
|
|
||||||
* @param {object} controller - an instance of AbortController if you need one, or null.
|
// Completes the prompt as a generator. Recommended for most use cases.
|
||||||
* @param {function} callback - The callback function to call when the completion is done.
|
//
|
||||||
* @returns {string} the completed text as a string. Ideally ignored, and you get at it via the callback.
|
// Example:
|
||||||
*/
|
//
|
||||||
export const llamaComplete = async (params, controller, callback) => {
|
// import { llama } from '/completion.js'
|
||||||
|
//
|
||||||
|
// const request = llama("Tell me a joke", {n_predict: 800})
|
||||||
|
// for await (const chunk of request) {
|
||||||
|
// document.write(chunk.data.content)
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
export async function* llama(prompt, params = {}, config = {}) {
|
||||||
|
let controller = config.controller;
|
||||||
|
|
||||||
if (!controller) {
|
if (!controller) {
|
||||||
controller = new AbortController();
|
controller = new AbortController();
|
||||||
}
|
}
|
||||||
const completionParams = { ...paramDefaults, ...params };
|
|
||||||
|
|
||||||
// we use fetch directly here becasue the built in fetchEventSource does not support POST
|
const completionParams = { ...paramDefaults, ...params, prompt };
|
||||||
|
|
||||||
const response = await fetch("/completion", {
|
const response = await fetch("/completion", {
|
||||||
method: 'POST',
|
method: 'POST',
|
||||||
body: JSON.stringify(completionParams),
|
body: JSON.stringify(completionParams),
|
||||||
|
@ -36,7 +45,6 @@ export const llamaComplete = async (params, controller, callback) => {
|
||||||
let content = "";
|
let content = "";
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|
||||||
let cont = true;
|
let cont = true;
|
||||||
|
|
||||||
while (cont) {
|
while (cont) {
|
||||||
|
@ -59,18 +67,21 @@ export const llamaComplete = async (params, controller, callback) => {
|
||||||
result.data = JSON.parse(result.data);
|
result.data = JSON.parse(result.data);
|
||||||
content += result.data.content;
|
content += result.data.content;
|
||||||
|
|
||||||
// callack
|
// yield
|
||||||
if (callback) {
|
yield result;
|
||||||
cont = callback(result) != false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// if we got a stop token from server, we will break here
|
// if we got a stop token from server, we will break here
|
||||||
if (result.data.stop) {
|
if (result.data.stop) {
|
||||||
|
if (result.data.generation_settings) {
|
||||||
|
generation_settings = result.data.generation_settings;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error("llama error: ", e);
|
if (e.name !== 'AbortError') {
|
||||||
|
console.error("llama error: ", e);
|
||||||
|
}
|
||||||
throw e;
|
throw e;
|
||||||
}
|
}
|
||||||
finally {
|
finally {
|
||||||
|
@ -79,3 +90,79 @@ export const llamaComplete = async (params, controller, callback) => {
|
||||||
|
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Call llama, return an event target that you can subcribe to
|
||||||
|
//
|
||||||
|
// Example:
|
||||||
|
//
|
||||||
|
// import { llamaEventTarget } from '/completion.js'
|
||||||
|
//
|
||||||
|
// const conn = llamaEventTarget(prompt)
|
||||||
|
// conn.addEventListener("message", (chunk) => {
|
||||||
|
// document.write(chunk.detail.content)
|
||||||
|
// })
|
||||||
|
//
|
||||||
|
export const llamaEventTarget = (prompt, params = {}, config = {}) => {
|
||||||
|
const eventTarget = new EventTarget();
|
||||||
|
(async () => {
|
||||||
|
let content = "";
|
||||||
|
for await (const chunk of llama(prompt, params, config)) {
|
||||||
|
if (chunk.data) {
|
||||||
|
content += chunk.data.content;
|
||||||
|
eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
|
||||||
|
}
|
||||||
|
if (chunk.data.generation_settings) {
|
||||||
|
eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
|
||||||
|
}
|
||||||
|
if (chunk.data.timings) {
|
||||||
|
eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
|
||||||
|
})();
|
||||||
|
return eventTarget;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Call llama, return a promise that resolves to the completed text. This does not support streaming
|
||||||
|
//
|
||||||
|
// Example:
|
||||||
|
//
|
||||||
|
// llamaPromise(prompt).then((content) => {
|
||||||
|
// document.write(content)
|
||||||
|
// })
|
||||||
|
//
|
||||||
|
// or
|
||||||
|
//
|
||||||
|
// const content = await llamaPromise(prompt)
|
||||||
|
// document.write(content)
|
||||||
|
//
|
||||||
|
export const llamaPromise = (prompt, params = {}, config = {}) => {
|
||||||
|
return new Promise(async (resolve, reject) => {
|
||||||
|
let content = "";
|
||||||
|
try {
|
||||||
|
for await (const chunk of llama(prompt, params, config)) {
|
||||||
|
content += chunk.data.content;
|
||||||
|
}
|
||||||
|
resolve(content);
|
||||||
|
} catch (error) {
|
||||||
|
reject(error);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* (deprecated)
|
||||||
|
*/
|
||||||
|
export const llamaComplete = async (params, controller, callback) => {
|
||||||
|
for await (const chunk of llama(params.prompt, params, { controller })) {
|
||||||
|
callback(chunk);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get the model info from the server. This is useful for getting the context window and so on.
|
||||||
|
export const llamaModelInfo = async () => {
|
||||||
|
if (!generation_settings) {
|
||||||
|
generation_settings = await fetch("/model.json").then(r => r.json());
|
||||||
|
}
|
||||||
|
return generation_settings;
|
||||||
|
}
|
||||||
|
|
|
@ -6,7 +6,6 @@
|
||||||
<title>llama.cpp - chat</title>
|
<title>llama.cpp - chat</title>
|
||||||
|
|
||||||
<style>
|
<style>
|
||||||
|
|
||||||
body {
|
body {
|
||||||
background-color: #fff;
|
background-color: #fff;
|
||||||
color: #000;
|
color: #000;
|
||||||
|
@ -22,10 +21,6 @@
|
||||||
height: 100%;
|
height: 100%;
|
||||||
}
|
}
|
||||||
|
|
||||||
header, footer {
|
|
||||||
text-align: center;
|
|
||||||
}
|
|
||||||
|
|
||||||
main {
|
main {
|
||||||
margin: 3px;
|
margin: 3px;
|
||||||
display: flex;
|
display: flex;
|
||||||
|
@ -99,6 +94,15 @@
|
||||||
margin: 0.5em 0;
|
margin: 0.5em 0;
|
||||||
display: block;
|
display: block;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
header, footer {
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
footer {
|
||||||
|
font-size: 80%;
|
||||||
|
color: #888;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
|
|
||||||
<script type="module">
|
<script type="module">
|
||||||
|
@ -106,10 +110,10 @@
|
||||||
html, h, signal, effect, computed, render, useSignal, useEffect, useRef
|
html, h, signal, effect, computed, render, useSignal, useEffect, useRef
|
||||||
} from '/index.js';
|
} from '/index.js';
|
||||||
|
|
||||||
import { llamaComplete } from '/completion.js';
|
import { llama } from '/completion.js';
|
||||||
|
|
||||||
const session = signal({
|
const session = signal({
|
||||||
prompt: "This is a conversation between user and llama, a friendly chatbot. respond in markdown.",
|
prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
|
||||||
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
template: "{{prompt}}\n\n{{history}}\n{{char}}:",
|
||||||
historyTemplate: "{{name}}: {{message}}",
|
historyTemplate: "{{name}}: {{message}}",
|
||||||
transcript: [],
|
transcript: [],
|
||||||
|
@ -118,15 +122,6 @@
|
||||||
user: "User",
|
user: "User",
|
||||||
})
|
})
|
||||||
|
|
||||||
const transcriptUpdate = (transcript) => {
|
|
||||||
session.value = {
|
|
||||||
...session.value,
|
|
||||||
transcript
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const chatStarted = computed(() => session.value.transcript.length > 0)
|
|
||||||
|
|
||||||
const params = signal({
|
const params = signal({
|
||||||
n_predict: 400,
|
n_predict: 400,
|
||||||
temperature: 0.7,
|
temperature: 0.7,
|
||||||
|
@ -136,8 +131,18 @@
|
||||||
top_p: 0.5,
|
top_p: 0.5,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
const llamaStats = signal(null)
|
||||||
const controller = signal(null)
|
const controller = signal(null)
|
||||||
|
|
||||||
const generating = computed(() => controller.value == null )
|
const generating = computed(() => controller.value == null )
|
||||||
|
const chatStarted = computed(() => session.value.transcript.length > 0)
|
||||||
|
|
||||||
|
const transcriptUpdate = (transcript) => {
|
||||||
|
session.value = {
|
||||||
|
...session.value,
|
||||||
|
transcript
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// simple template replace
|
// simple template replace
|
||||||
const template = (str, extraSettings) => {
|
const template = (str, extraSettings) => {
|
||||||
|
@ -158,7 +163,7 @@
|
||||||
|
|
||||||
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])
|
||||||
|
|
||||||
const payload = template(session.value.template, {
|
const prompt = template(session.value.template, {
|
||||||
message: msg,
|
message: msg,
|
||||||
history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"),
|
history: session.value.transcript.flatMap(([name, message]) => template(session.value.historyTemplate, {name, message})).join("\n"),
|
||||||
});
|
});
|
||||||
|
@ -168,22 +173,26 @@
|
||||||
|
|
||||||
const llamaParams = {
|
const llamaParams = {
|
||||||
...params.value,
|
...params.value,
|
||||||
prompt: payload,
|
|
||||||
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
|
||||||
}
|
}
|
||||||
|
|
||||||
await llamaComplete(llamaParams, controller.value, (message) => {
|
for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
|
||||||
const data = message.data;
|
const data = chunk.data;
|
||||||
currentMessage += data.content;
|
currentMessage += data.content;
|
||||||
|
|
||||||
// remove leading whitespace
|
// remove leading whitespace
|
||||||
currentMessage = currentMessage.replace(/^\s+/, "")
|
currentMessage = currentMessage.replace(/^\s+/, "")
|
||||||
|
|
||||||
transcriptUpdate([...history, ["{{char}}", currentMessage]])
|
transcriptUpdate([...history, ["{{char}}", currentMessage]])
|
||||||
|
|
||||||
if (data.stop) {
|
if (data.stop) {
|
||||||
console.log("-->", data, ' response was:', currentMessage, 'transcript state:', session.value.transcript);
|
console.log("Completion finished: '", currentMessage, "', summary: ", data);
|
||||||
}
|
}
|
||||||
})
|
|
||||||
|
if (data.timings) {
|
||||||
|
llamaStats.value = data.timings;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
controller.value = null;
|
controller.value = null;
|
||||||
}
|
}
|
||||||
|
@ -219,13 +228,12 @@
|
||||||
return html`
|
return html`
|
||||||
<form onsubmit=${submit}>
|
<form onsubmit=${submit}>
|
||||||
<div>
|
<div>
|
||||||
<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>
|
<textarea type="text" rows=2 onkeypress=${enterSubmits} value="${message}" oninput=${(e) => message.value = e.target.value} placeholder="Say something..."/>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
<div class="right">
|
<div class="right">
|
||||||
<button type="submit" disabled=${!generating.value} >Send</button>
|
<button type="submit" disabled=${!generating.value} >Send</button>
|
||||||
<button onclick=${stop} disabled=${generating}>Stop</button>
|
<button onclick=${stop} disabled=${generating}>Stop</button>
|
||||||
<button onclick=${reset}>Reset</button>
|
<button onclick=${reset}>Reset</button>
|
||||||
</div>
|
</div>
|
||||||
</form>
|
</form>
|
||||||
`
|
`
|
||||||
|
@ -243,7 +251,7 @@
|
||||||
}, [messages])
|
}, [messages])
|
||||||
|
|
||||||
const chatLine = ([user, msg]) => {
|
const chatLine = ([user, msg]) => {
|
||||||
return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdown} text=${template(msg)} /></p>`
|
return html`<p key=${msg}><strong>${template(user)}:</strong> <${Markdownish} text=${template(msg)} /></p>`
|
||||||
};
|
};
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
|
@ -313,39 +321,52 @@
|
||||||
</form>
|
</form>
|
||||||
`
|
`
|
||||||
}
|
}
|
||||||
const Markdown = (params) => {
|
// poor mans markdown replacement
|
||||||
const md = params.text
|
const Markdownish = (params) => {
|
||||||
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
|
const md = params.text
|
||||||
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
|
.replace(/^#{1,6} (.*)$/gim, '<h3>$1</h3>')
|
||||||
.replace(/__(.*?)__/g, '<strong>$1</strong>')
|
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
|
||||||
.replace(/\*(.*?)\*/g, '<em>$1</em>')
|
.replace(/__(.*?)__/g, '<strong>$1</strong>')
|
||||||
.replace(/_(.*?)_/g, '<em>$1</em>')
|
.replace(/\*(.*?)\*/g, '<em>$1</em>')
|
||||||
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
|
.replace(/_(.*?)_/g, '<em>$1</em>')
|
||||||
.replace(/`(.*?)`/g, '<code>$1</code>')
|
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
|
||||||
.replace(/\n/gim, '<br />');
|
.replace(/`(.*?)`/g, '<code>$1</code>')
|
||||||
return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
|
.replace(/\n/gim, '<br />');
|
||||||
};
|
return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
|
||||||
|
};
|
||||||
|
|
||||||
|
const ModelGenerationInfo = (params) => {
|
||||||
|
if (!llamaStats.value) {
|
||||||
|
return html`<span/>`
|
||||||
|
}
|
||||||
|
return html`
|
||||||
|
<span>
|
||||||
|
${llamaStats.value.predicted_per_token_ms.toFixed()}ms per token, ${llamaStats.value.predicted_per_second.toFixed(2)} tokens per second
|
||||||
|
</span>
|
||||||
|
`
|
||||||
|
}
|
||||||
|
|
||||||
function App(props) {
|
function App(props) {
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
<div id="container">
|
<div id="container">
|
||||||
<header>
|
<header>
|
||||||
<h1>llama.cpp</h1>
|
<h1>llama.cpp</h1>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
<main id="content">
|
<main id="content">
|
||||||
<${chatStarted.value ? ChatLog : ConfigForm} />
|
<${chatStarted.value ? ChatLog : ConfigForm} />
|
||||||
</main>
|
</main>
|
||||||
|
|
||||||
<footer id="write">
|
<section id="write">
|
||||||
<${MessageInput} />
|
<${MessageInput} />
|
||||||
</footer>
|
</section>
|
||||||
|
|
||||||
<footer>
|
<footer>
|
||||||
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a></p>
|
<p><${ModelGenerationInfo} /></p>
|
||||||
</footer>
|
<p>Powered by <a href="https://github.com/ggerganov/llama.cpp">llama.cpp</a> and <a href="https://ggml.ai">ggml.ai</a>.</p>
|
||||||
</div>
|
</footer>
|
||||||
|
</div>
|
||||||
`;
|
`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) {
|
||||||
return rnd->rd(rnd->gen);
|
return rnd->rd(rnd->gen);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||||
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||||
|
|
||||||
|
if (plan.work_size > 0) {
|
||||||
|
buf.resize(plan.work_size);
|
||||||
|
plan.work_data = buf.data();
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_graph_compute(graph, &plan);
|
||||||
|
}
|
||||||
|
|
||||||
struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
|
struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
|
||||||
float scale = 1.0f; // xavier
|
float scale = 1.0f; // xavier
|
||||||
switch (tensor->n_dims) {
|
switch (tensor->n_dims) {
|
||||||
|
@ -1426,11 +1437,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
|
||||||
|
|
||||||
gf->n_nodes = 0;
|
gf->n_nodes = 0;
|
||||||
gf->n_leafs = 0;
|
gf->n_leafs = 0;
|
||||||
gf->work_size = 0;
|
|
||||||
gf->perf_runs = 0;
|
gf->perf_runs = 0;
|
||||||
gf->perf_cycles = 0;
|
gf->perf_cycles = 0;
|
||||||
gf->perf_time_us = 0;
|
gf->perf_time_us = 0;
|
||||||
gf->work = NULL;
|
|
||||||
|
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
//const int n_ctx = hparams.n_ctx;
|
//const int n_ctx = hparams.n_ctx;
|
||||||
|
@ -3162,6 +3171,7 @@ int main(int argc, char ** argv) {
|
||||||
printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
|
printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
|
||||||
// ggml_print_tensor_objects(model.ctx);
|
// ggml_print_tensor_objects(model.ctx);
|
||||||
|
|
||||||
|
// TODO: use std::vector<uint8_t> intead of "new"
|
||||||
size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
|
size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
|
||||||
uint8_t * compute_addr = new uint8_t[compute_size];
|
uint8_t * compute_addr = new uint8_t[compute_size];
|
||||||
|
|
||||||
|
@ -3183,6 +3193,8 @@ int main(int argc, char ** argv) {
|
||||||
GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
|
GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::vector<uint8_t> work_buffer;
|
||||||
|
|
||||||
printf("%s: begin training\n", __func__);
|
printf("%s: begin training\n", __func__);
|
||||||
|
|
||||||
for (int ex = 0; ex < params.n_examples; ++ex) {
|
for (int ex = 0; ex < params.n_examples; ++ex) {
|
||||||
|
@ -3217,9 +3229,6 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
|
struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
|
||||||
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
|
struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
|
||||||
|
|
||||||
// ggml_cgraph gf = {};
|
|
||||||
gf->n_threads = params.n_threads;
|
|
||||||
gb->n_threads = params.n_threads;
|
|
||||||
|
|
||||||
get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);
|
get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);
|
||||||
|
|
||||||
|
@ -3248,7 +3257,7 @@ int main(int argc, char ** argv) {
|
||||||
*gb = ggml_build_backward(ctx0, gf, true);
|
*gb = ggml_build_backward(ctx0, gf, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_graph_compute(ctx0, gf);
|
ggml_graph_compute_helper(work_buffer, gf, params.n_threads);
|
||||||
|
|
||||||
size_t used_mem_before_opt = ggml_used_mem(ctx0);
|
size_t used_mem_before_opt = ggml_used_mem(ctx0);
|
||||||
|
|
||||||
|
@ -3272,7 +3281,7 @@ int main(int argc, char ** argv) {
|
||||||
model.train_samples += n_batch;
|
model.train_samples += n_batch;
|
||||||
model.train_tokens += n_batch * n_tokens;
|
model.train_tokens += n_batch * n_tokens;
|
||||||
|
|
||||||
ggml_graph_compute(ctx0, gf);
|
ggml_graph_compute_helper(work_buffer, gf, params.n_threads);
|
||||||
|
|
||||||
float error_after_opt = ggml_get_f32_1d(loss, 0);
|
float error_after_opt = ggml_get_f32_1d(loss, 0);
|
||||||
|
|
||||||
|
@ -3354,13 +3363,12 @@ int main(int argc, char ** argv) {
|
||||||
struct ggml_context * ctx0 = ggml_init(cparams);
|
struct ggml_context * ctx0 = ggml_init(cparams);
|
||||||
|
|
||||||
ggml_cgraph gf = {};
|
ggml_cgraph gf = {};
|
||||||
gf.n_threads = params.n_threads;
|
|
||||||
|
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
|
struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
|
||||||
|
|
||||||
ggml_build_forward_expand(&gf, logits);
|
ggml_build_forward_expand(&gf, logits);
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute_helper(work_buffer, &gf, params.n_threads);
|
||||||
|
|
||||||
//struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
|
//struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
|
||||||
//struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
|
//struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
|
||||||
|
@ -3386,6 +3394,7 @@ int main(int argc, char ** argv) {
|
||||||
delete[] compute_addr;
|
delete[] compute_addr;
|
||||||
delete[] compute_buf_0;
|
delete[] compute_buf_0;
|
||||||
delete[] compute_buf_1;
|
delete[] compute_buf_1;
|
||||||
|
|
||||||
llama_free(lctx);
|
llama_free(lctx);
|
||||||
llama_free_model(lmodel);
|
llama_free_model(lmodel);
|
||||||
ggml_free(model.ctx);
|
ggml_free(model.ctx);
|
||||||
|
|
546
ggml-cuda.cu
546
ggml-cuda.cu
|
@ -115,8 +115,8 @@ typedef float2 dfloat2;
|
||||||
#endif //GGML_CUDA_DMMV_F16
|
#endif //GGML_CUDA_DMMV_F16
|
||||||
|
|
||||||
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
|
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
|
||||||
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
|
typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
|
||||||
typedef void (*dot_kernel_k_t)(const void * vx, const int ib, const int iqs, const float * y, float & v);
|
typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
|
||||||
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
|
||||||
typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||||
typedef void (*ggml_cuda_op_t)(
|
typedef void (*ggml_cuda_op_t)(
|
||||||
|
@ -126,9 +126,11 @@ typedef void (*ggml_cuda_op_t)(
|
||||||
|
|
||||||
// QK = number of values after dequantization
|
// QK = number of values after dequantization
|
||||||
// QR = QK / number of values before dequantization
|
// QR = QK / number of values before dequantization
|
||||||
|
// QI = number of 32 bit integers before dequantization
|
||||||
|
|
||||||
#define QK4_0 32
|
#define QK4_0 32
|
||||||
#define QR4_0 2
|
#define QR4_0 2
|
||||||
|
#define QI4_0 4
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d; // delta
|
half d; // delta
|
||||||
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
||||||
|
@ -137,6 +139,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
|
||||||
|
|
||||||
#define QK4_1 32
|
#define QK4_1 32
|
||||||
#define QR4_1 2
|
#define QR4_1 2
|
||||||
|
#define QI4_1 4
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d; // delta
|
half d; // delta
|
||||||
half m; // min
|
half m; // min
|
||||||
|
@ -146,6 +149,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
|
||||||
|
|
||||||
#define QK5_0 32
|
#define QK5_0 32
|
||||||
#define QR5_0 2
|
#define QR5_0 2
|
||||||
|
#define QI5_0 4
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d; // delta
|
half d; // delta
|
||||||
uint8_t qh[4]; // 5-th bit of quants
|
uint8_t qh[4]; // 5-th bit of quants
|
||||||
|
@ -155,6 +159,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
|
||||||
|
|
||||||
#define QK5_1 32
|
#define QK5_1 32
|
||||||
#define QR5_1 2
|
#define QR5_1 2
|
||||||
|
#define QI5_1 4
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d; // delta
|
half d; // delta
|
||||||
half m; // min
|
half m; // min
|
||||||
|
@ -165,12 +170,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
|
||||||
|
|
||||||
#define QK8_0 32
|
#define QK8_0 32
|
||||||
#define QR8_0 1
|
#define QR8_0 1
|
||||||
|
#define QI8_0 8
|
||||||
typedef struct {
|
typedef struct {
|
||||||
half d; // delta
|
half d; // delta
|
||||||
int8_t qs[QK8_0]; // quants
|
int8_t qs[QK8_0]; // quants
|
||||||
} block_q8_0;
|
} block_q8_0;
|
||||||
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
||||||
|
|
||||||
|
// q8_1: 8-bit quantization block that also carries the sum of the unquantized inputs.
// QK8_1 = number of values after dequantization
// QR8_1 = QK8_1 / number of values before dequantization
// QI8_1 = number of 32 bit integers before dequantization
#define QK8_1 32
#define QR8_1 1
#define QI8_1 8
typedef struct {
    half    d;              // delta
    half    s;              // unquantized sum
    int8_t  qs[QK8_1];      // quants (sized by this format's own block length, not q8_0's)
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_1, "wrong q8_1 block size/padding");
|
||||||
|
|
||||||
|
// Signature of the per-thread dot product routines that mul_mat_vec_q takes as a template argument:
// vbq   - pointer to one quantized block of the matrix operand (concrete type depends on the format)
// bq8_1 - pointer to the q8_1 block of the vector operand it is multiplied with
// iqs   - index of the 32-bit slice of the block's quants that the calling thread processes
// Returns that thread's partial dot product as a float.
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs);
|
||||||
|
|
||||||
//================================= k-quants
|
//================================= k-quants
|
||||||
|
|
||||||
#ifdef GGML_QKK_64
|
#ifdef GGML_QKK_64
|
||||||
|
@ -246,6 +264,7 @@ typedef struct {
|
||||||
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
|
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
|
||||||
|
|
||||||
#define WARP_SIZE 32
|
#define WARP_SIZE 32
|
||||||
|
#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
||||||
|
|
||||||
#define CUDA_ADD_BLOCK_SIZE 256
|
#define CUDA_ADD_BLOCK_SIZE 256
|
||||||
#define CUDA_MUL_BLOCK_SIZE 256
|
#define CUDA_MUL_BLOCK_SIZE 256
|
||||||
|
@ -254,14 +273,15 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
||||||
#define CUDA_SCALE_BLOCK_SIZE 256
|
#define CUDA_SCALE_BLOCK_SIZE 256
|
||||||
#define CUDA_ROPE_BLOCK_SIZE 256
|
#define CUDA_ROPE_BLOCK_SIZE 256
|
||||||
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
|
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
|
||||||
|
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
||||||
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
|
||||||
|
|
||||||
// dmmv = dequantize_mul_mat_vec
|
// dmmv = dequantize_mul_mat_vec
|
||||||
#ifndef GGML_CUDA_DMMV_X
|
#ifndef GGML_CUDA_DMMV_X
|
||||||
#define GGML_CUDA_DMMV_X 32
|
#define GGML_CUDA_DMMV_X 32
|
||||||
#endif
|
#endif
|
||||||
#ifndef GGML_CUDA_DMMV_Y
|
#ifndef GGML_CUDA_MMV_Y
|
||||||
#define GGML_CUDA_DMMV_Y 1
|
#define GGML_CUDA_MMV_Y 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef K_QUANTS_PER_ITERATION
|
#ifndef K_QUANTS_PER_ITERATION
|
||||||
|
@ -326,7 +346,6 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
|
||||||
}
|
}
|
||||||
|
|
||||||
// sum up partial sums
|
// sum up partial sums
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -445,7 +464,7 @@ static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const in
|
||||||
|
|
||||||
//================================== k-quants
|
//================================== k-quants
|
||||||
|
|
||||||
static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, float * __restrict__ yy) {
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
const block_q2_K * x = (const block_q2_K *) vx;
|
const block_q2_K * x = (const block_q2_K *) vx;
|
||||||
|
@ -478,7 +497,7 @@ static __global__ void dequantize_block_q2_K(const void * vx, float * yy) {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q3_K(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, float * __restrict__ yy) {
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
const block_q3_K * x = (const block_q3_K *) vx;
|
const block_q3_K * x = (const block_q3_K *) vx;
|
||||||
|
@ -542,7 +561,7 @@ static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, float * __restrict__ yy) {
|
||||||
const block_q4_K * x = (const block_q4_K *) vx;
|
const block_q4_K * x = (const block_q4_K *) vx;
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
|
@ -582,7 +601,7 @@ static __global__ void dequantize_block_q4_K(const void * vx, float * yy) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, float * __restrict__ yy) {
|
||||||
const block_q5_K * x = (const block_q5_K *) vx;
|
const block_q5_K * x = (const block_q5_K *) vx;
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
|
@ -628,7 +647,7 @@ static __global__ void dequantize_block_q5_K(const void * vx, float * yy) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, float * __restrict__ yy) {
|
||||||
const block_q6_K * x = (const block_q6_K *) vx;
|
const block_q6_K * x = (const block_q6_K *) vx;
|
||||||
|
|
||||||
const int i = blockIdx.x;
|
const int i = blockIdx.x;
|
||||||
|
@ -672,7 +691,7 @@ static __global__ void dequantize_block_q6_K(const void * vx, float * yy) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||||
|
|
||||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||||
|
|
||||||
|
@ -770,7 +789,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -781,7 +799,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||||
|
|
||||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row > nrows) return;
|
||||||
|
@ -875,7 +893,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -886,7 +903,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||||
|
|
||||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row > nrows) return;
|
||||||
|
@ -979,7 +996,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -990,7 +1006,7 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float * yy, float * dst, const int ncols) {
|
static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) {
|
||||||
|
|
||||||
const int row = blockIdx.x;
|
const int row = blockIdx.x;
|
||||||
const int num_blocks_per_row = ncols / QK_K;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
|
@ -1084,7 +1100,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -1095,7 +1110,7 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float * yy, float * dst, const int ncols, int nrows) {
|
static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||||
|
|
||||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||||
|
|
||||||
|
@ -1195,7 +1210,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -1214,8 +1228,43 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
|
||||||
v.y = x[ib + iqs + 1];
|
v.y = x[ib + iqs + 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Quantizes a float array into q8_1 blocks, one value per thread.
// x     - input floats
// vy    - output buffer, interpreted as an array of block_q8_1
// ndata - number of readable elements in x; indices >= ndata are quantized as 0.0f
// k     - total number of values to quantize (threads with index >= k do nothing)
// The per-block maximum and sum are obtained with a 32-lane xor shuffle, so the 32 values
// of one block (QK8_1) are expected to sit in the 32 lanes of one warp.
// NOTE(review): the early return precedes full-mask shuffles; this presumes k is a multiple
// of 32 so that no warp is partially exited - confirm against the caller.
static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int ndata, const int k) {
    const int idx = blockDim.x*blockIdx.x + threadIdx.x;

    if (idx >= k) {
        return;
    }

    block_q8_1 * blocks = (block_q8_1 *) vy;

    const int block_id = idx / QK8_1; // which q8_1 block this value belongs to
    const int quant_id = idx % QK8_1; // position of the value inside that block

    // values beyond the real data are treated as zero padding
    float val = 0.0f;
    if (idx < ndata) {
        val = x[idx];
    }

    float abs_max = fabsf(val);
    float total   = val;

    // butterfly reduction: afterwards every lane holds the maximum |value| and the sum
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        abs_max = fmaxf(abs_max, __shfl_xor_sync(0xffffffff, abs_max, offset, 32));
        total  += __shfl_xor_sync(0xffffffff, total, offset, 32);
    }

    const float scale = abs_max / 127;
    // an all-zero block would divide by zero, store 0 directly in that case
    const int8_t quant = abs_max == 0.0f ? 0 : roundf(val / scale);

    blocks[block_id].qs[quant_id] = quant;

    // the first value of each block is responsible for writing the block header
    if (quant_id == 0) {
        blocks[block_id].d = scale;
        blocks[block_id].s = total;
    }
}
|
||||||
|
|
||||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||||
static __global__ void dequantize_block(const void * vx, float * y, const int k) {
|
static __global__ void dequantize_block(const void * __restrict__ vx, float * __restrict__ y, const int k) {
|
||||||
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
|
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
|
||||||
|
|
||||||
if (i >= k) {
|
if (i >= k) {
|
||||||
|
@ -1235,8 +1284,184 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
|
||||||
y[iybs + iqs + y_offset] = v.y;
|
y[iybs + iqs + y_offset] = v.y;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Partial dot product between one q4_0 block and one q8_1 block.
// vbq   - pointer to the block_q4_0 operand
// bq8_1 - pointer to the block_q8_1 operand
// iqs   - index of the 32-bit slice of the q4_0 quants handled by the calling thread
// Returns the integer dot product of that slice scaled by both block deltas. When compiled
// for an architecture without __dp4a the body is a stub that returns 0.0f.
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 610 // __dp4a requires compute capability 6.1 or higher
    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;

    // qs follows a single half in block_q4_0, so it is not int-aligned: copy instead of casting
    int vi;
    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
    // low nibbles pair with q8_1 slice iqs, high nibbles with slice iqs + QI4_0
    const int ui0 = *((const int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
    const int ui1 = *((const int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);

    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);

    // subtract 8 from each quantized value
    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);

    // SIMD dot product of quantized values
    int sumi = __dp4a(vi0, ui0, 0);
    sumi     = __dp4a(vi1, ui1, sumi);

    return sumi*d;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 610
}
|
||||||
|
|
||||||
|
// Partial dot product between one q4_1 block and one q8_1 block.
// vbq   - pointer to the block_q4_1 operand
// bq8_1 - pointer to the block_q8_1 operand
// iqs   - index of the 32-bit slice of the q4_1 quants handled by the calling thread
// Returns the scaled integer dot product of that slice plus this thread's share of the
// min * sum term. When compiled for an architecture without __dp4a the body is a stub
// that returns 0.0f.
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 610 // __dp4a requires compute capability 6.1 or higher
    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;

    const int vi  = *((const int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
    // low nibbles pair with q8_1 slice iqs, high nibbles with slice iqs + QI4_1
    const int ui0 = *((const int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
    const int ui1 = *((const int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);

    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
    const float m = __half2float(bq4_1->m);
    const float s = __half2float(bq8_1->s);

    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
    const int vi1 = (vi >> 4) & 0x0F0F0F0F;

    // SIMD dot product of quantized values
    int sumi = __dp4a(vi0, ui0, 0);
    sumi     = __dp4a(vi1, ui1, sumi);

    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 610
}
|
||||||
|
|
||||||
|
// Partial dot product between one q5_0 block and one q8_1 block.
// vbq   - pointer to the block_q5_0 operand
// bq8_1 - pointer to the block_q8_1 operand
// iqs   - index of the 32-bit slice of the q5_0 quants handled by the calling thread
// Returns the integer dot product of that slice scaled by both block deltas. When compiled
// for an architecture without __dp4a the body is a stub that returns 0.0f.
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 610 // __dp4a requires compute capability 6.1 or higher
    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

    // copy the packed nibbles instead of casting the byte pointer to int *
    int qs;
    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
    // four high bits for the low nibbles (qh0) and for the high nibbles (qh1) of this slice
    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
    const int ui0 = *((const int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
    const int ui1 = *((const int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);

    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);

    int vi0 = (qs  >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
    vi0    |= (qh0 <<  4) & 0x00000010; // qh0 bit 0 -> bit  4 (5th bit of byte 0)
    vi0    |= (qh0 << 11) & 0x00001000; // qh0 bit 1 -> bit 12 (5th bit of byte 1)
    vi0    |= (qh0 << 18) & 0x00100000; // qh0 bit 2 -> bit 20 (5th bit of byte 2)
    vi0    |= (qh0 << 25) & 0x10000000; // qh0 bit 3 -> bit 28 (5th bit of byte 3)
    vi0     = __vsub4(vi0,  0x10101010); // subtract 16 from quantized values
    int sumi = __dp4a(vi0, ui0, 0);      // SIMD dot product of quantized values

    int vi1 = (qs  >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
    vi1    |= (qh1 <<  4) & 0x00000010; // qh1 bit 0 -> bit  4 (5th bit of byte 0)
    vi1    |= (qh1 << 11) & 0x00001000; // qh1 bit 1 -> bit 12 (5th bit of byte 1)
    vi1    |= (qh1 << 18) & 0x00100000; // qh1 bit 2 -> bit 20 (5th bit of byte 2)
    vi1    |= (qh1 << 25) & 0x10000000; // qh1 bit 3 -> bit 28 (5th bit of byte 3)
    vi1     = __vsub4(vi1,  0x10101010); // subtract 16 from quantized values
    sumi    = __dp4a(vi1, ui1, sumi);    // SIMD dot product of quantized values

    return sumi*d;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 610
}
|
||||||
|
|
||||||
|
// Partial dot product between one q5_1 block and one q8_1 block.
// vbq   - pointer to the block_q5_1 operand
// bq8_1 - pointer to the block_q8_1 operand
// iqs   - index of the 32-bit slice of the q5_1 quants handled by the calling thread
// Returns the scaled integer dot product of that slice plus this thread's share of the
// min * sum term. When compiled for an architecture without __dp4a the body is a stub
// that returns 0.0f.
static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 610 // __dp4a requires compute capability 6.1 or higher
    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;

    const int qs  = *((const int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
    // four high bits for the low nibbles (qh0) and for the high nibbles (qh1) of this slice
    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
    const int ui0 = *((const int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
    const int ui1 = *((const int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);

    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
    const float m = __half2float(bq5_1->m);
    const float s = __half2float(bq8_1->s);

    int vi0 = (qs  >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
    vi0    |= (qh0 <<  4) & 0x00000010; // qh0 bit 0 -> bit  4 (5th bit of byte 0)
    vi0    |= (qh0 << 11) & 0x00001000; // qh0 bit 1 -> bit 12 (5th bit of byte 1)
    vi0    |= (qh0 << 18) & 0x00100000; // qh0 bit 2 -> bit 20 (5th bit of byte 2)
    vi0    |= (qh0 << 25) & 0x10000000; // qh0 bit 3 -> bit 28 (5th bit of byte 3)
    int sumi = __dp4a(vi0, ui0, 0);      // SIMD dot product of quantized values

    int vi1 = (qs  >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
    vi1    |= (qh1 <<  4) & 0x00000010; // qh1 bit 0 -> bit  4 (5th bit of byte 0)
    vi1    |= (qh1 << 11) & 0x00001000; // qh1 bit 1 -> bit 12 (5th bit of byte 1)
    vi1    |= (qh1 << 18) & 0x00100000; // qh1 bit 2 -> bit 20 (5th bit of byte 2)
    vi1    |= (qh1 << 25) & 0x10000000; // qh1 bit 3 -> bit 28 (5th bit of byte 3)
    sumi    = __dp4a(vi1, ui1, sumi);    // SIMD dot product of quantized values

    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 610
}
|
||||||
|
|
||||||
|
// Partial dot product between one q8_0 block and one q8_1 block.
// vbq   - pointer to the block_q8_0 operand
// bq8_1 - pointer to the block_q8_1 operand
// iqs   - index of the 32-bit slice (four int8 quants) handled by the calling thread
// Returns the integer dot product of that slice scaled by both block deltas. When compiled
// for an architecture without __dp4a the body is a stub that returns 0.0f.
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
#if __CUDA_ARCH__ >= 610 // __dp4a requires compute capability 6.1 or higher
    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

    // qs follows a single half in block_q8_0, so it is not int-aligned: copy instead of casting
    int vi;
    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
    const int ui = *((const int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);

    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);

    // SIMD dot product of quantized values
    int sumi = __dp4a(vi, ui, 0);

    return sumi*d;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 610
}
|
||||||
|
|
||||||
|
// Matrix-vector product on quantized data: dst[row] = dot(row of vx, vy).
// Template parameters:
//   qk             - number of columns covered by one x block (blocks per row = ncols / qk)
//   qi             - number of consecutive threads that share one block; threadIdx.x / qi picks
//                    the block, threadIdx.x % qi the 32-bit slice passed to vec_dot_q_cuda
//   block_q_t      - block type that vx is reinterpreted as
//   vec_dot_q_cuda - per-thread dot product of one x block slice with a q8_1 block
// vx is the quantized matrix, vy the vector as block_q8_1, dst receives one float per row.
// NOTE(review): the loop has no per-lane bounds check, so the block count per row is presumably
// padded to a multiple of WARP_SIZE / qi (see MATRIX_ROW_PADDING) - confirm against the caller.
// NOTE(review): the final reduction spans 32 lanes, which assumes blockDim.x == WARP_SIZE - confirm.
template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
    const int row = blockIdx.y*blockDim.y + threadIdx.y;
    if (row >= nrows) {
        return;
    }

    const block_q_t  * x_blocks = (const block_q_t  *) vx;
    const block_q8_1 * y_blocks = (const block_q8_1 *) vy;

    const int n_row_blocks = ncols / qk;      // x blocks per matrix row
    const int warp_stride  = WARP_SIZE / qi;  // blocks consumed by the warp per iteration

    const int lane_block = threadIdx.x / qi;  // which of those blocks this thread works on
    const int lane_slice = threadIdx.x % qi;  // x block quant index when casting the quants to int

    // each thread accumulates its own partial sum
    float acc = 0.0f;

    for (int first = 0; first < n_row_blocks; first += warp_stride) {
        const int ix = row*n_row_blocks + first + lane_block; // x block index
        const int iy = first + lane_block;                    // y block index

        acc += vec_dot_q_cuda(&x_blocks[ix], &y_blocks[iy], lane_slice);
    }

    // combine the partial sums of all lanes; every lane ends up with the total
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        acc += __shfl_xor_sync(0xffffffff, acc, offset, 32);
    }

    // a single lane writes the result for this row
    if (threadIdx.x == 0) {
        dst[row] = acc;
    }
}
|
||||||
|
|
||||||
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
||||||
static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
|
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||||
// qk = quantized weights per x block
|
// qk = quantized weights per x block
|
||||||
// qr = number of quantized weights per data value in x block
|
// qr = number of quantized weights per data value in x block
|
||||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
||||||
|
@ -1289,7 +1514,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
|
||||||
}
|
}
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -1304,7 +1528,7 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
|
static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) {
|
||||||
const half * x = (const half *) vx;
|
const half * x = (const half *) vx;
|
||||||
|
|
||||||
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
||||||
|
@ -1340,7 +1564,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
|
||||||
const int idst = channel*nrows_dst + row_dst;
|
const int idst = channel*nrows_dst + row_dst;
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -1352,7 +1575,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
|
||||||
}
|
}
|
||||||
|
|
||||||
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||||
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
|
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
|
||||||
const int row_stride_x, const int channel_stride_x) {
|
const int row_stride_x, const int channel_stride_x) {
|
||||||
|
|
||||||
const half * x = (const half *) vx;
|
const half * x = (const half *) vx;
|
||||||
|
@ -1386,7 +1609,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||||
}
|
}
|
||||||
|
|
||||||
// sum up partial sums and write back result
|
// sum up partial sums and write back result
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -1496,7 +1718,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
|
||||||
}
|
}
|
||||||
|
|
||||||
// sum up partial sums
|
// sum up partial sums
|
||||||
__syncthreads();
|
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
|
||||||
|
@ -1550,6 +1771,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
|
||||||
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
|
rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void quantize_row_q8_1_cuda(const float * x, void * vy, const int ndata, const int k, cudaStream_t stream) {
|
||||||
|
const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
|
||||||
|
quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, ndata, k);
|
||||||
|
}
|
||||||
|
|
||||||
static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||||
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||||
|
@ -1618,45 +1844,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
|
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
|
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
|
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
|
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
|
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
|
@ -1703,6 +1929,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
||||||
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
|
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
|
||||||
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
|
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
|
||||||
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
|
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
|
||||||
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
|
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
|
||||||
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
|
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
|
||||||
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
|
}
|
||||||
|
|
||||||
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
|
||||||
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
|
||||||
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
|
||||||
|
@ -1710,9 +1981,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
|
||||||
|
|
||||||
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(1, block_num_y, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<1, 1, convert_f16>
|
dequantize_mul_mat_vec<1, 1, convert_f16>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
|
@ -1878,6 +2149,7 @@ static size_t g_scratch_offset = 0;
|
||||||
|
|
||||||
static int g_device_count = -1;
|
static int g_device_count = -1;
|
||||||
static int g_main_device = 0;
|
static int g_main_device = 0;
|
||||||
|
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
|
||||||
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
|
||||||
|
|
||||||
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
||||||
|
@ -1895,9 +2167,12 @@ void ggml_init_cublas() {
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
cudaDeviceProp prop;
|
cudaDeviceProp prop;
|
||||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||||
fprintf(stderr, " Device %d: %s\n", id, prop.name);
|
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
||||||
|
|
||||||
g_tensor_split[id] = total_vram;
|
g_tensor_split[id] = total_vram;
|
||||||
total_vram += prop.totalGlobalMem;
|
total_vram += prop.totalGlobalMem;
|
||||||
|
|
||||||
|
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
|
||||||
}
|
}
|
||||||
for (int id = 0; id < g_device_count; ++id) {
|
for (int id = 0; id < g_device_count; ++id) {
|
||||||
g_tensor_split[id] /= total_vram;
|
g_tensor_split[id] /= total_vram;
|
||||||
|
@ -2113,7 +2388,7 @@ inline void ggml_cuda_op_rms_norm(
|
||||||
(void) i1;
|
(void) i1;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
inline void ggml_cuda_op_mul_mat_vec(
|
||||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
|
||||||
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
|
||||||
cudaStream_t & cudaStream_main){
|
cudaStream_t & cudaStream_main){
|
||||||
|
@ -2125,69 +2400,115 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
|
||||||
const int64_t ne00 = src0->ne[0];
|
const int64_t ne00 = src0->ne[0];
|
||||||
const int64_t nrows = i01_high - i01_low;
|
const int64_t nrows = i01_high - i01_low;
|
||||||
|
|
||||||
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
#ifdef GGML_CUDA_FORCE_DMMV
|
||||||
#ifdef GGML_CUDA_DMMV_F16
|
const bool use_mul_mat_vec_q = false;
|
||||||
size_t ash;
|
|
||||||
dfloat * src1_dfloat = nullptr; // dfloat == half
|
|
||||||
|
|
||||||
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
|
||||||
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
|
||||||
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
|
||||||
|
|
||||||
if (src1_convert_f16) {
|
|
||||||
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
|
||||||
ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
|
||||||
ne00, 1, sizeof(float), 0, 0,
|
|
||||||
ne00, 1, sizeof(half), 0, 0, cudaStream_main);
|
|
||||||
}
|
|
||||||
#else
|
#else
|
||||||
dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
|
int id;
|
||||||
|
CUDA_CHECK(cudaGetDevice(&id));
|
||||||
|
|
||||||
|
const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
|
||||||
|
src0->type == GGML_TYPE_Q4_1 ||
|
||||||
|
src0->type == GGML_TYPE_Q5_0 ||
|
||||||
|
src0->type == GGML_TYPE_Q5_1 ||
|
||||||
|
src0->type == GGML_TYPE_Q8_0;
|
||||||
|
|
||||||
|
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 600 && mul_mat_vec_q_implemented;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (use_mul_mat_vec_q) {
|
||||||
|
int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1;
|
||||||
|
padded_row_size -= padded_row_size % MATRIX_ROW_PADDING;
|
||||||
|
size_t as;
|
||||||
|
void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as);
|
||||||
|
quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main);
|
||||||
|
|
||||||
|
switch (src0->type) {
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q5_0:
|
||||||
|
mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q5_1:
|
||||||
|
mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
|
break;
|
||||||
|
case GGML_TYPE_Q8_0:
|
||||||
|
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_cuda_pool_free(src1_q8_1, as);
|
||||||
|
} else {
|
||||||
|
// on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
|
||||||
|
#ifdef GGML_CUDA_DMMV_F16
|
||||||
|
size_t ash;
|
||||||
|
dfloat * src1_dfloat = nullptr; // dfloat == half
|
||||||
|
|
||||||
|
bool src1_convert_f16 = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
|
||||||
|
src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
|
||||||
|
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
|
||||||
|
|
||||||
|
if (src1_convert_f16) {
|
||||||
|
src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
|
||||||
|
ggml_cpy_f32_f16_cuda((char *) src1_ddf_i, (char *) src1_dfloat, ne00,
|
||||||
|
ne00, 1, sizeof(float), 0, 0,
|
||||||
|
ne00, 1, sizeof(half), 0, 0, cudaStream_main);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
dfloat * src1_dfloat = src1_ddf_i; // dfloat == float, no conversion
|
||||||
#endif // GGML_CUDA_DMMV_F16
|
#endif // GGML_CUDA_DMMV_F16
|
||||||
|
|
||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q4_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q4_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q5_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q5_1_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q8_0_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q2_K:
|
case GGML_TYPE_Q2_K:
|
||||||
dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q2_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q3_K:
|
case GGML_TYPE_Q3_K:
|
||||||
dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q3_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q4_K:
|
case GGML_TYPE_Q4_K:
|
||||||
dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q4_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q5_K:
|
case GGML_TYPE_Q5_K:
|
||||||
dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q5_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_Q6_K:
|
case GGML_TYPE_Q6_K:
|
||||||
dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
dequantize_mul_mat_vec_q6_K_cuda(src0_ddq_i, src1_ddf_i, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
convert_mul_mat_vec_f16_cuda(src0_ddq_i, src1_dfloat, dst_ddf_i, ne00, nrows, cudaStream_main);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_CUDA_DMMV_F16
|
#ifdef GGML_CUDA_DMMV_F16
|
||||||
if (src1_convert_f16) {
|
if (src1_convert_f16) {
|
||||||
ggml_cuda_pool_free(src1_dfloat, ash);
|
ggml_cuda_pool_free(src1_dfloat, ash);
|
||||||
}
|
}
|
||||||
#endif // GGML_CUDA_DMMV_F16
|
#endif // GGML_CUDA_DMMV_F16
|
||||||
|
}
|
||||||
|
|
||||||
(void) src1;
|
(void) src1;
|
||||||
(void) dst;
|
(void) dst;
|
||||||
|
@ -2757,8 +3078,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
|
||||||
}else if (src0->type == GGML_TYPE_F32) {
|
}else if (src0->type == GGML_TYPE_F32) {
|
||||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
||||||
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
||||||
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
|
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
||||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
|
||||||
} else {
|
} else {
|
||||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
||||||
}
|
}
|
||||||
|
@ -2843,7 +3164,11 @@ void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens
|
||||||
|
|
||||||
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
||||||
int nrows = ggml_nrows(tensor);
|
int nrows = ggml_nrows(tensor);
|
||||||
|
|
||||||
|
const int64_t ne0 = tensor->ne[0];
|
||||||
|
|
||||||
const size_t nb1 = tensor->nb[1];
|
const size_t nb1 = tensor->nb[1];
|
||||||
|
|
||||||
ggml_backend backend = tensor->backend;
|
ggml_backend backend = tensor->backend;
|
||||||
struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
|
||||||
memset(extra, 0, sizeof(*extra));
|
memset(extra, 0, sizeof(*extra));
|
||||||
|
@ -2872,11 +3197,24 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
|
||||||
int64_t nrows_split = row_high - row_low;
|
int64_t nrows_split = row_high - row_low;
|
||||||
|
|
||||||
const size_t offset_split = row_low*nb1;
|
const size_t offset_split = row_low*nb1;
|
||||||
const size_t size = ggml_nbytes_split(tensor, nrows_split);
|
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
||||||
|
const size_t original_size = size;
|
||||||
|
|
||||||
void * buf;
|
// pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses
|
||||||
|
if (ne0 % MATRIX_ROW_PADDING != 0) {
|
||||||
|
size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
|
||||||
|
* ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
|
||||||
|
}
|
||||||
|
|
||||||
|
char * buf;
|
||||||
CUDA_CHECK(cudaMalloc(&buf, size));
|
CUDA_CHECK(cudaMalloc(&buf, size));
|
||||||
void * buf_host = (char*)data + offset_split;
|
char * buf_host = (char*)data + offset_split;
|
||||||
|
|
||||||
|
// set padding to 0 to avoid possible NaN values
|
||||||
|
if (size > original_size) {
|
||||||
|
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
|
cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
|
||||||
|
|
||||||
|
|
|
@ -34,9 +34,13 @@ extern "C" {
|
||||||
|
|
||||||
struct ggml_metal_context;
|
struct ggml_metal_context;
|
||||||
|
|
||||||
struct ggml_metal_context * ggml_metal_init(void);
|
// number of command buffers to use
|
||||||
|
struct ggml_metal_context * ggml_metal_init(int n_cb);
|
||||||
void ggml_metal_free(struct ggml_metal_context * ctx);
|
void ggml_metal_free(struct ggml_metal_context * ctx);
|
||||||
|
|
||||||
|
// set the number of command buffers to use
|
||||||
|
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
|
||||||
|
|
||||||
// creates a mapping between a host memory buffer and a device memory buffer
|
// creates a mapping between a host memory buffer and a device memory buffer
|
||||||
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
||||||
// - the mapping is used during computation to determine the arguments of the compute kernels
|
// - the mapping is used during computation to determine the arguments of the compute kernels
|
||||||
|
|
11
ggml-metal.m
11
ggml-metal.m
|
@ -25,6 +25,8 @@ struct ggml_metal_buffer {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_metal_context {
|
struct ggml_metal_context {
|
||||||
|
int n_cb;
|
||||||
|
|
||||||
float * logits;
|
float * logits;
|
||||||
|
|
||||||
id<MTLDevice> device;
|
id<MTLDevice> device;
|
||||||
|
@ -86,11 +88,12 @@ static NSString * const msl_library_source = @"see metal.metal";
|
||||||
@implementation GGMLMetalClass
|
@implementation GGMLMetalClass
|
||||||
@end
|
@end
|
||||||
|
|
||||||
struct ggml_metal_context * ggml_metal_init(void) {
|
struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||||
fprintf(stderr, "%s: allocating\n", __func__);
|
fprintf(stderr, "%s: allocating\n", __func__);
|
||||||
|
|
||||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||||
|
|
||||||
|
ctx->n_cb = n_cb;
|
||||||
ctx->device = MTLCreateSystemDefaultDevice();
|
ctx->device = MTLCreateSystemDefaultDevice();
|
||||||
ctx->queue = [ctx->device newCommandQueue];
|
ctx->queue = [ctx->device newCommandQueue];
|
||||||
ctx->n_buffers = 0;
|
ctx->n_buffers = 0;
|
||||||
|
@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||||
free(ctx);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
|
||||||
|
ctx->n_cb = n_cb;
|
||||||
|
}
|
||||||
|
|
||||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||||
// Metal buffer based on the host memory pointer
|
// Metal buffer based on the host memory pointer
|
||||||
|
@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
|
||||||
// create multiple command buffers and enqueue them
|
// create multiple command buffers and enqueue them
|
||||||
// then, we encode the graph into the command buffers in parallel
|
// then, we encode the graph into the command buffers in parallel
|
||||||
|
|
||||||
const int n_cb = gf->n_threads;
|
const int n_cb = ctx->n_cb;
|
||||||
|
|
||||||
NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
|
NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
|
||||||
|
|
||||||
|
|
|
@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
||||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||||
const int in = tid - step*im; // 0...15 or 0...7
|
const int in = tid - step*im; // 0...15 or 0...7
|
||||||
|
|
||||||
#if K_QUANTS_PER_ITERATION == 1
|
\n#if K_QUANTS_PER_ITERATION == 1\n
|
||||||
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
||||||
const int is = 0;
|
const int is = 0;
|
||||||
#else
|
|
||||||
|
\n#else\n
|
||||||
|
|
||||||
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
||||||
const int is = in / 4;
|
const int is = in / 4;
|
||||||
#endif
|
|
||||||
|
\n#endif\n
|
||||||
|
|
||||||
const int ql_offset = 64*im + l0;
|
const int ql_offset = 64*im + l0;
|
||||||
const int qh_offset = 32*im + l0;
|
const int qh_offset = 32*im + l0;
|
||||||
const int s_offset = 8*im + is;
|
const int s_offset = 8*im + is;
|
||||||
|
@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
||||||
|
|
||||||
const float d = vload_half(0, &x[i].d);
|
const float d = vload_half(0, &x[i].d);
|
||||||
|
|
||||||
#if K_QUANTS_PER_ITERATION == 1
|
\n#if K_QUANTS_PER_ITERATION == 1\n
|
||||||
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
||||||
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
||||||
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
||||||
|
@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
||||||
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
||||||
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
||||||
tmp[16 * ix + tid] += sum;
|
tmp[16 * ix + tid] += sum;
|
||||||
#else
|
\n#else\n
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
for (int l = 0; l < 4; ++l) {
|
for (int l = 0; l < 4; ++l) {
|
||||||
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
||||||
|
@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
||||||
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
||||||
}
|
}
|
||||||
tmp[16 * ix + tid] += sum;
|
tmp[16 * ix + tid] += sum;
|
||||||
#endif
|
\n#endif\n
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
181
ggml.h
181
ggml.h
|
@ -65,7 +65,7 @@
|
||||||
// ggml_set_f32(a, 3.0f);
|
// ggml_set_f32(a, 3.0f);
|
||||||
// ggml_set_f32(b, 4.0f);
|
// ggml_set_f32(b, 4.0f);
|
||||||
//
|
//
|
||||||
// ggml_graph_compute(ctx0, &gf);
|
// ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
|
||||||
//
|
//
|
||||||
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
|
// printf("f = %f\n", ggml_get_f32_1d(f, 0));
|
||||||
//
|
//
|
||||||
|
@ -201,6 +201,8 @@
|
||||||
#define GGML_MAX_NAME 48
|
#define GGML_MAX_NAME 48
|
||||||
#define GGML_DEFAULT_N_THREADS 4
|
#define GGML_DEFAULT_N_THREADS 4
|
||||||
|
|
||||||
|
#define GGML_UNUSED(x) (void)(x)
|
||||||
|
|
||||||
#define GGML_ASSERT(x) \
|
#define GGML_ASSERT(x) \
|
||||||
do { \
|
do { \
|
||||||
if (!(x)) { \
|
if (!(x)) { \
|
||||||
|
@ -209,6 +211,30 @@
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
|
// used to copy the number of elements and stride in bytes of tensors into local variables.
|
||||||
|
// main purpose is to reduce code duplication and improve readability.
|
||||||
|
//
|
||||||
|
// example:
|
||||||
|
//
|
||||||
|
// GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
|
||||||
|
// GGML_TENSOR_LOCALS(size_t, nb1, src1, nb);
|
||||||
|
//
|
||||||
|
#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
|
||||||
|
const type prefix##0 = (pointer)->array[0]; \
|
||||||
|
GGML_UNUSED(prefix##0);
|
||||||
|
#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
|
||||||
|
GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \
|
||||||
|
const type prefix##1 = (pointer)->array[1]; \
|
||||||
|
GGML_UNUSED(prefix##1);
|
||||||
|
#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
|
||||||
|
GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \
|
||||||
|
const type prefix##2 = (pointer)->array[2]; \
|
||||||
|
GGML_UNUSED(prefix##2);
|
||||||
|
#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
|
||||||
|
GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \
|
||||||
|
const type prefix##3 = (pointer)->array[3]; \
|
||||||
|
GGML_UNUSED(prefix##3);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
@ -224,8 +250,8 @@ extern "C" {
|
||||||
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
|
||||||
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
|
||||||
|
|
||||||
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
|
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
|
||||||
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);
|
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
|
||||||
|
|
||||||
struct ggml_object;
|
struct ggml_object;
|
||||||
struct ggml_context;
|
struct ggml_context;
|
||||||
|
@ -295,12 +321,15 @@ extern "C" {
|
||||||
GGML_OP_SUM,
|
GGML_OP_SUM,
|
||||||
GGML_OP_SUM_ROWS,
|
GGML_OP_SUM_ROWS,
|
||||||
GGML_OP_MEAN,
|
GGML_OP_MEAN,
|
||||||
|
GGML_OP_ARGMAX,
|
||||||
GGML_OP_REPEAT,
|
GGML_OP_REPEAT,
|
||||||
GGML_OP_REPEAT_BACK,
|
GGML_OP_REPEAT_BACK,
|
||||||
GGML_OP_ABS,
|
GGML_OP_ABS,
|
||||||
GGML_OP_SGN,
|
GGML_OP_SGN,
|
||||||
GGML_OP_NEG,
|
GGML_OP_NEG,
|
||||||
GGML_OP_STEP,
|
GGML_OP_STEP,
|
||||||
|
GGML_OP_TANH,
|
||||||
|
GGML_OP_ELU,
|
||||||
GGML_OP_RELU,
|
GGML_OP_RELU,
|
||||||
GGML_OP_GELU,
|
GGML_OP_GELU,
|
||||||
GGML_OP_GELU_QUICK,
|
GGML_OP_GELU_QUICK,
|
||||||
|
@ -332,9 +361,8 @@ extern "C" {
|
||||||
GGML_OP_ROPE_BACK,
|
GGML_OP_ROPE_BACK,
|
||||||
GGML_OP_ALIBI,
|
GGML_OP_ALIBI,
|
||||||
GGML_OP_CLAMP,
|
GGML_OP_CLAMP,
|
||||||
GGML_OP_CONV_1D_S1_PH,
|
GGML_OP_CONV_1D,
|
||||||
GGML_OP_CONV_1D_S2_PH,
|
GGML_OP_CONV_2D,
|
||||||
GGML_OP_CONV_2D_SK_P0,
|
|
||||||
|
|
||||||
GGML_OP_FLASH_ATTN,
|
GGML_OP_FLASH_ATTN,
|
||||||
GGML_OP_FLASH_FF,
|
GGML_OP_FLASH_FF,
|
||||||
|
@ -390,9 +418,6 @@ extern "C" {
|
||||||
struct ggml_tensor * src1;
|
struct ggml_tensor * src1;
|
||||||
struct ggml_tensor * opt[GGML_MAX_OPT];
|
struct ggml_tensor * opt[GGML_MAX_OPT];
|
||||||
|
|
||||||
// thread scheduling
|
|
||||||
int n_tasks;
|
|
||||||
|
|
||||||
// performance
|
// performance
|
||||||
int perf_runs;
|
int perf_runs;
|
||||||
int64_t perf_cycles;
|
int64_t perf_cycles;
|
||||||
|
@ -404,19 +429,27 @@ extern "C" {
|
||||||
|
|
||||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||||
|
|
||||||
char padding[4];
|
char padding[8];
|
||||||
};
|
};
|
||||||
|
|
||||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||||
|
|
||||||
|
// the compute plan that needs to be prepared for ggml_graph_compute()
|
||||||
|
// since https://github.com/ggerganov/ggml/issues/287
|
||||||
|
struct ggml_cplan {
|
||||||
|
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
|
||||||
|
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
|
||||||
|
|
||||||
|
int n_threads;
|
||||||
|
|
||||||
|
// the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
|
||||||
|
int n_tasks[GGML_MAX_NODES];
|
||||||
|
};
|
||||||
|
|
||||||
// computation graph
|
// computation graph
|
||||||
struct ggml_cgraph {
|
struct ggml_cgraph {
|
||||||
int n_nodes;
|
int n_nodes;
|
||||||
int n_leafs;
|
int n_leafs;
|
||||||
int n_threads;
|
|
||||||
|
|
||||||
size_t work_size;
|
|
||||||
struct ggml_tensor * work;
|
|
||||||
|
|
||||||
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
struct ggml_tensor * nodes[GGML_MAX_NODES];
|
||||||
struct ggml_tensor * grads[GGML_MAX_NODES];
|
struct ggml_tensor * grads[GGML_MAX_NODES];
|
||||||
|
@ -690,6 +723,11 @@ extern "C" {
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
// argmax along rows
|
||||||
|
GGML_API struct ggml_tensor * ggml_argmax(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
// if a is the same shape as b, and a is not parameter, return a
|
// if a is the same shape as b, and a is not parameter, return a
|
||||||
// otherwise, return a new tensor: repeat(a) to fit in b
|
// otherwise, return a new tensor: repeat(a) to fit in b
|
||||||
GGML_API struct ggml_tensor * ggml_repeat(
|
GGML_API struct ggml_tensor * ggml_repeat(
|
||||||
|
@ -734,6 +772,22 @@ extern "C" {
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_tanh(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_tanh_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_elu(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_elu_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_relu(
|
GGML_API struct ggml_tensor * ggml_relu(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
@ -1084,58 +1138,33 @@ extern "C" {
|
||||||
float min,
|
float min,
|
||||||
float max);
|
float max);
|
||||||
|
|
||||||
// TODO: implement general-purpose convolutions
|
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||||
// GGML_API struct ggml_tensor * ggml_conv_1d(
|
|
||||||
// struct ggml_context * ctx,
|
|
||||||
// struct ggml_tensor * a,
|
|
||||||
// struct ggml_tensor * b,
|
|
||||||
// int s0
|
|
||||||
// int p0,
|
|
||||||
// int d0);
|
|
||||||
//
|
|
||||||
// GGML_API struct ggml_tensor * ggml_conv_2d(
|
|
||||||
// struct ggml_context * ctx,
|
|
||||||
// struct ggml_tensor * a,
|
|
||||||
// struct ggml_tensor * b,
|
|
||||||
// int s0,
|
|
||||||
// int s1,
|
|
||||||
// int p0,
|
|
||||||
// int p1,
|
|
||||||
// int d0,
|
|
||||||
// int d1);
|
|
||||||
|
|
||||||
// padding = half
|
|
||||||
// TODO: we don't support extra parameters for now
|
|
||||||
// that's why we are hard-coding the stride, padding, and dilation
|
|
||||||
// not great ..
|
|
||||||
// example:
|
|
||||||
// a: 3 80 768 1
|
|
||||||
// b: 3000 80 1 1
|
|
||||||
// res: 3000 768 1 1
|
|
||||||
// used in whisper
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
|
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b,
|
||||||
|
int s0, // stride
|
||||||
|
int p0, // padding
|
||||||
|
int d0); // dilation
|
||||||
|
|
||||||
// used in whisper
|
GGML_API struct ggml_tensor * ggml_conv_2d(
|
||||||
GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
|
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b,
|
||||||
|
int s0,
|
||||||
|
int s1,
|
||||||
|
int p0,
|
||||||
|
int p1,
|
||||||
|
int d0,
|
||||||
|
int d1);
|
||||||
|
|
||||||
// kernel size is a->ne[0] x a->ne[1]
|
// conv_1d with padding = half
|
||||||
// stride is equal to kernel size
|
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
|
||||||
// padding is zero
|
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
|
||||||
// example:
|
|
||||||
// a: 16 16 3 768
|
|
||||||
// b: 1024 1024 3 1
|
|
||||||
// res: 64 64 768 1
|
|
||||||
// used in sam
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
|
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b,
|
||||||
|
int s,
|
||||||
|
int d);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_flash_attn(
|
GGML_API struct ggml_tensor * ggml_flash_attn(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
|
@ -1266,15 +1295,22 @@ extern "C" {
|
||||||
|
|
||||||
GGML_API void ggml_set_param(
|
GGML_API void ggml_set_param(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * tensor);
|
struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
||||||
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
||||||
|
|
||||||
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||||
|
GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
||||||
|
GGML_API void ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||||
|
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
|
||||||
|
|
||||||
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||||
|
// note: the drawback of this API is that you must ensure that the context has enough memory for the work data
|
||||||
|
GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||||
|
|
||||||
|
@ -1491,25 +1527,24 @@ extern "C" {
|
||||||
//
|
//
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
// restrict not standard in C++
|
// restrict not standard in C++
|
||||||
#define GGML_RESTRICT
|
#define GGML_RESTRICT
|
||||||
#else
|
#else
|
||||||
#define GGML_RESTRICT restrict
|
#define GGML_RESTRICT restrict
|
||||||
#endif
|
#endif
|
||||||
typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
||||||
typedef void (*quantize_row_q_t) (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
||||||
typedef void (*vec_dot_q_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
typedef void (*ggml_vec_dot_t) (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
dequantize_row_q_t dequantize_row_q;
|
ggml_to_float_t to_float;
|
||||||
quantize_row_q_t quantize_row_q;
|
ggml_from_float_t from_float;
|
||||||
quantize_row_q_t quantize_row_q_reference;
|
ggml_from_float_t from_float_reference;
|
||||||
quantize_row_q_t quantize_row_q_dot;
|
ggml_vec_dot_t vec_dot;
|
||||||
vec_dot_q_t vec_dot_q;
|
enum ggml_type vec_dot_type;
|
||||||
enum ggml_type vec_dot_type;
|
} ggml_type_traits_t;
|
||||||
} quantize_fns_t;
|
|
||||||
|
|
||||||
quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
|
ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type i);
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
|
|
137
llama.cpp
137
llama.cpp
|
@ -79,6 +79,25 @@ void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
|
||||||
(void) tensor;
|
(void) tensor;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// ggml helpers
|
||||||
|
//
|
||||||
|
|
||||||
|
// Plans and runs `graph` in one call.
// - buf:       caller-owned byte vector used as the plan's work buffer; it is resized
//              to plan.work_size when that size is non-zero and left untouched otherwise.
// - graph:     graph passed to both ggml_graph_plan() and ggml_graph_compute().
// - n_threads: forwarded to ggml_graph_plan().
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||||
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||||
|

|
||||||
|
if (plan.work_size > 0) { // a work buffer is only attached when the plan asks for one
|
||||||
|
buf.resize(plan.work_size);
|
||||||
|
plan.work_data = buf.data(); // plan points into `buf`, so `buf` must stay alive until compute returns
|
||||||
|
}
|
||||||
|

|
||||||
|
ggml_graph_compute(graph, &plan);
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// memory sizes
|
||||||
|
//
|
||||||
|
|
||||||
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0()
|
||||||
{
|
{
|
||||||
static std::map<e_model, size_t> k_sizes = {
|
static std::map<e_model, size_t> k_sizes = {
|
||||||
|
@ -321,6 +340,9 @@ struct llama_context {
|
||||||
// input embedding (1-dimensional array: [n_embd])
|
// input embedding (1-dimensional array: [n_embd])
|
||||||
std::vector<float> embedding;
|
std::vector<float> embedding;
|
||||||
|
|
||||||
|
// reusable buffer for `struct ggml_cplan.work_data`
|
||||||
|
std::vector<uint8_t> work_buffer;
|
||||||
|
|
||||||
// memory buffers used to evaluate the model
|
// memory buffers used to evaluate the model
|
||||||
// TODO: move in llama_state
|
// TODO: move in llama_state
|
||||||
llama_ctx_buffer buf_compute;
|
llama_ctx_buffer buf_compute;
|
||||||
|
@ -758,7 +780,6 @@ struct llama_model_loader {
|
||||||
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// kv cache
|
// kv cache
|
||||||
//
|
//
|
||||||
|
@ -1156,6 +1177,7 @@ static void llama_model_load_internal(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif // GGML_USE_CUBLAS
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
||||||
|
|
||||||
|
@ -1164,6 +1186,10 @@ static void llama_model_load_internal(
|
||||||
fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
|
fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
|
||||||
}
|
}
|
||||||
size_t vram_kv_cache = 0;
|
size_t vram_kv_cache = 0;
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CUBLAS
|
||||||
|
const int max_backend_supported_layers = hparams.n_layer + 3;
|
||||||
|
const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
|
||||||
if (n_gpu_layers > (int) hparams.n_layer + 1) {
|
if (n_gpu_layers > (int) hparams.n_layer + 1) {
|
||||||
if (low_vram) {
|
if (low_vram) {
|
||||||
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
|
||||||
|
@ -1180,14 +1206,18 @@ static void llama_model_load_internal(
|
||||||
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
|
vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
|
#elif defined(GGML_USE_CLBLAST)
|
||||||
|
const int max_backend_supported_layers = hparams.n_layer + 1;
|
||||||
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
||||||
|
#endif // GGML_USE_CUBLAS
|
||||||
|
|
||||||
fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
|
fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
|
||||||
__func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
|
__func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
||||||
fprintf(stderr, "%s: total VRAM used: %zu MB\n",
|
fprintf(stderr, "%s: total VRAM used: %zu MB\n",
|
||||||
__func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
|
__func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
|
||||||
#else
|
#else
|
||||||
(void) n_gpu_layers;
|
(void) n_gpu_layers;
|
||||||
#endif
|
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
||||||
}
|
}
|
||||||
|
|
||||||
// populate `tensors_by_name`
|
// populate `tensors_by_name`
|
||||||
|
@ -1256,17 +1286,11 @@ static bool llama_eval_internal(
|
||||||
const float * embd,
|
const float * embd,
|
||||||
const int n_tokens,
|
const int n_tokens,
|
||||||
const int n_past,
|
const int n_past,
|
||||||
const int n_threads,
|
int n_threads,
|
||||||
const char * cgraph_fname) {
|
const char * cgraph_fname) {
|
||||||
|
|
||||||
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
|
||||||
|
|
||||||
// enforce that the first token is BOS
|
|
||||||
if (tokens && n_past == 0 && tokens[0] != llama_token_bos()) {
|
|
||||||
fprintf(stderr, "%s: first token must be BOS\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
const int64_t t_start_us = ggml_time_us();
|
const int64_t t_start_us = ggml_time_us();
|
||||||
|
|
||||||
const int N = n_tokens;
|
const int N = n_tokens;
|
||||||
|
@ -1297,10 +1321,11 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
struct ggml_context * ctx0 = ggml_init(params);
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
|
|
||||||
|
ggml_cgraph gf = {};
|
||||||
|
|
||||||
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
||||||
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
|
||||||
ggml_cgraph gf = {};
|
n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
||||||
gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
|
|
||||||
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
struct ggml_tensor * inpL;
|
struct ggml_tensor * inpL;
|
||||||
|
@ -1584,6 +1609,7 @@ static bool llama_eval_internal(
|
||||||
|
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
if (lctx.ctx_metal && N == 1) {
|
if (lctx.ctx_metal && N == 1) {
|
||||||
|
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
|
||||||
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
|
ggml_metal_graph_compute(lctx.ctx_metal, &gf);
|
||||||
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
ggml_metal_get_tensor (lctx.ctx_metal, cur);
|
||||||
} else {
|
} else {
|
||||||
|
@ -1603,10 +1629,10 @@ static bool llama_eval_internal(
|
||||||
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (cgraph_fname) {
|
if (cgraph_fname) {
|
||||||
|
@ -1896,10 +1922,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
|
||||||
|
|
||||||
llama_sample_softmax(ctx, candidates);
|
llama_sample_softmax(ctx, candidates);
|
||||||
|
|
||||||
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
// Compute the cumulative probabilities
|
// Compute the cumulative probabilities
|
||||||
float cum_sum = 0.0f;
|
float cum_sum = 0.0f;
|
||||||
size_t last_idx = candidates->size;
|
size_t last_idx = candidates->size;
|
||||||
|
@ -1928,9 +1954,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
|
||||||
|
|
||||||
llama_sample_softmax(nullptr, candidates);
|
llama_sample_softmax(nullptr, candidates);
|
||||||
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
// Compute the first and second derivatives
|
// Compute the first and second derivatives
|
||||||
std::vector<float> first_derivatives(candidates->size - 1);
|
std::vector<float> first_derivatives(candidates->size - 1);
|
||||||
|
@ -1982,11 +2007,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_start_sample_us = ggml_time_us();
|
|
||||||
|
|
||||||
// Compute the softmax of logits and calculate entropy
|
// Compute the softmax of logits and calculate entropy
|
||||||
llama_sample_softmax(nullptr, candidates);
|
llama_sample_softmax(nullptr, candidates);
|
||||||
|
|
||||||
|
const int64_t t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
float entropy = 0.0f;
|
float entropy = 0.0f;
|
||||||
for (size_t i = 0; i < candidates->size; ++i) {
|
for (size_t i = 0; i < candidates->size; ++i) {
|
||||||
entropy += -candidates->data[i].p * logf(candidates->data[i].p);
|
entropy += -candidates->data[i].p * logf(candidates->data[i].p);
|
||||||
|
@ -2155,13 +2180,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
|
||||||
|
|
||||||
if (ctx) {
|
if (ctx) {
|
||||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
ctx->n_sample++;
|
|
||||||
}
|
}
|
||||||
return X;
|
return X;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
|
llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
|
||||||
assert(ctx);
|
|
||||||
int64_t t_start_sample_us;
|
int64_t t_start_sample_us;
|
||||||
t_start_sample_us = ggml_time_us();
|
t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
|
@ -2176,13 +2199,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
|
||||||
candidates->size = 1;
|
candidates->size = 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ctx) {
|
||||||
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
||||||
|
}
|
||||||
|
|
||||||
// Normalize the probabilities of the remaining words
|
// Normalize the probabilities of the remaining words
|
||||||
llama_sample_softmax(ctx, candidates);
|
llama_sample_softmax(ctx, candidates);
|
||||||
|
|
||||||
// Sample the next word X from the remaining words
|
// Sample the next word X from the remaining words
|
||||||
if (ctx) {
|
|
||||||
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
||||||
}
|
|
||||||
llama_token X = llama_sample_token(ctx, candidates);
|
llama_token X = llama_sample_token(ctx, candidates);
|
||||||
t_start_sample_us = ggml_time_us();
|
t_start_sample_us = ggml_time_us();
|
||||||
|
|
||||||
|
@ -2250,10 +2274,10 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
|
||||||
}
|
}
|
||||||
float * f32_output = (float *) output.addr;
|
float * f32_output = (float *) output.addr;
|
||||||
|
|
||||||
quantize_fns_t qtype;
|
ggml_type_traits_t qtype;
|
||||||
if (ggml_is_quantized(tensor.type)) {
|
if (ggml_is_quantized(tensor.type)) {
|
||||||
qtype = ggml_internal_get_quantize_fn(tensor.type);
|
qtype = ggml_internal_get_type_traits(tensor.type);
|
||||||
if (qtype.dequantize_row_q == NULL) {
|
if (qtype.to_float == NULL) {
|
||||||
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
|
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor.type)));
|
||||||
}
|
}
|
||||||
} else if (tensor.type != GGML_TYPE_F16) {
|
} else if (tensor.type != GGML_TYPE_F16) {
|
||||||
|
@ -2264,7 +2288,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
|
||||||
if (tensor.type == GGML_TYPE_F16) {
|
if (tensor.type == GGML_TYPE_F16) {
|
||||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor.data, f32_output, nelements);
|
||||||
} else if (ggml_is_quantized(tensor.type)) {
|
} else if (ggml_is_quantized(tensor.type)) {
|
||||||
qtype.dequantize_row_q(tensor.data, f32_output, nelements);
|
qtype.to_float(tensor.data, f32_output, nelements);
|
||||||
} else {
|
} else {
|
||||||
LLAMA_ASSERT(false); // unreachable
|
LLAMA_ASSERT(false); // unreachable
|
||||||
}
|
}
|
||||||
|
@ -2289,7 +2313,7 @@ static void llama_convert_tensor_internal(const llama_load_tensor & tensor, llam
|
||||||
if (typ == GGML_TYPE_F16) {
|
if (typ == GGML_TYPE_F16) {
|
||||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
||||||
} else {
|
} else {
|
||||||
qtype.dequantize_row_q(inbuf, outbuf, nels);
|
qtype.to_float(inbuf, outbuf, nels);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
|
workers.push_back(std::thread(compute, tensor.type, tensor.data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
|
||||||
|
@ -2568,8 +2592,8 @@ void llama_free_model(struct llama_model * model) {
|
||||||
}
|
}
|
||||||
|
|
||||||
struct llama_context * llama_new_context_with_model(
|
struct llama_context * llama_new_context_with_model(
|
||||||
struct llama_model * model,
|
struct llama_model * model,
|
||||||
struct llama_context_params params) {
|
struct llama_context_params params) {
|
||||||
|
|
||||||
if (!model) {
|
if (!model) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
|
@ -2638,7 +2662,7 @@ struct llama_context * llama_new_context_with_model(
|
||||||
#ifdef GGML_USE_METAL
|
#ifdef GGML_USE_METAL
|
||||||
if (params.n_gpu_layers > 0) {
|
if (params.n_gpu_layers > 0) {
|
||||||
// this allocates all Metal resources and memory buffers
|
// this allocates all Metal resources and memory buffers
|
||||||
ctx->ctx_metal = ggml_metal_init();
|
ctx->ctx_metal = ggml_metal_init(1);
|
||||||
|
|
||||||
void * data_ptr = NULL;
|
void * data_ptr = NULL;
|
||||||
size_t data_size = 0;
|
size_t data_size = 0;
|
||||||
|
@ -2795,6 +2819,9 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
||||||
// read tensors and apply
|
// read tensors and apply
|
||||||
bool warned = false;
|
bool warned = false;
|
||||||
int n_tensors = 0;
|
int n_tensors = 0;
|
||||||
|
|
||||||
|
std::vector<uint8_t> work_buffer;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
int32_t n_dims;
|
int32_t n_dims;
|
||||||
int32_t length;
|
int32_t length;
|
||||||
|
@ -2959,8 +2986,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph gf = ggml_build_forward(r);
|
struct ggml_cgraph gf = ggml_build_forward(r);
|
||||||
gf.n_threads = n_threads;
|
|
||||||
ggml_graph_compute(lora_ctx, &gf);
|
ggml_graph_compute_helper(work_buffer, &gf, n_threads);
|
||||||
|
|
||||||
// we won't need these tensors again, reset the context to save memory
|
// we won't need these tensors again, reset the context to save memory
|
||||||
ggml_free(lora_ctx);
|
ggml_free(lora_ctx);
|
||||||
|
@ -3113,7 +3140,6 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
||||||
|
|
||||||
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
||||||
ggml_cgraph gf{};
|
ggml_cgraph gf{};
|
||||||
gf.n_threads = 1;
|
|
||||||
|
|
||||||
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
||||||
kout3d->data = out;
|
kout3d->data = out;
|
||||||
|
@ -3133,7 +3159,7 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
||||||
|
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
|
||||||
ggml_graph_compute(cpy_ctx, &gf);
|
ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
|
||||||
|
|
||||||
ggml_free(cpy_ctx);
|
ggml_free(cpy_ctx);
|
||||||
}
|
}
|
||||||
|
@ -3219,7 +3245,6 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||||
|
|
||||||
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
|
||||||
ggml_cgraph gf{};
|
ggml_cgraph gf{};
|
||||||
gf.n_threads = 1;
|
|
||||||
|
|
||||||
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
|
||||||
kin3d->data = (void *) inp;
|
kin3d->data = (void *) inp;
|
||||||
|
@ -3239,7 +3264,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
||||||
|
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
|
||||||
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
|
||||||
ggml_graph_compute(cpy_ctx, &gf);
|
ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
|
||||||
|
|
||||||
ggml_free(cpy_ctx);
|
ggml_free(cpy_ctx);
|
||||||
}
|
}
|
||||||
|
@ -3473,23 +3498,35 @@ llama_token llama_token_nl() {
|
||||||
return 13;
|
return 13;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
||||||
|
struct llama_timings result = {
|
||||||
|
/*.t_start_ms =*/ 1e-3 * ctx->t_start_us,
|
||||||
|
/*.t_end_ms =*/ 1.00 * ggml_time_ms(),
|
||||||
|
/*.t_load_ms =*/ 1e-3 * ctx->t_load_us,
|
||||||
|
/*.t_sample_ms =*/ 1e-3 * ctx->t_sample_us,
|
||||||
|
/*.t_p_eval_ms =*/ 1e-3 * ctx->t_p_eval_us,
|
||||||
|
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
|
||||||
|
|
||||||
|
/*.n_sample =*/ std::max(1, ctx->n_sample),
|
||||||
|
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
|
||||||
|
/*.n_eval =*/ std::max(1, ctx->n_eval),
|
||||||
|
};
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
void llama_print_timings(struct llama_context * ctx) {
|
void llama_print_timings(struct llama_context * ctx) {
|
||||||
const int64_t t_end_us = ggml_time_us();
|
const llama_timings timings = llama_get_timings(ctx);
|
||||||
|
|
||||||
const int32_t n_sample = std::max(1, ctx->n_sample);
|
|
||||||
const int32_t n_eval = std::max(1, ctx->n_eval);
|
|
||||||
const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
|
|
||||||
|
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
|
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, timings.t_load_ms);
|
||||||
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||||
__func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample, 1e6 / ctx->t_sample_us * n_sample);
|
__func__, timings.t_sample_ms, timings.n_sample, timings.t_sample_ms / timings.n_sample, 1e3 / timings.t_sample_ms * timings.n_sample);
|
||||||
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||||
__func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval, 1e6 / ctx->t_p_eval_us * n_p_eval);
|
__func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
|
||||||
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
||||||
__func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval, 1e6 / ctx->t_eval_us * n_eval);
|
__func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
|
||||||
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
|
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_reset_timings(struct llama_context * ctx) {
|
void llama_reset_timings(struct llama_context * ctx) {
|
||||||
|
|
15
llama.h
15
llama.h
|
@ -134,6 +134,20 @@ extern "C" {
|
||||||
bool quantize_output_tensor; // quantize output.weight
|
bool quantize_output_tensor; // quantize output.weight
|
||||||
} llama_model_quantize_params;
|
} llama_model_quantize_params;
|
||||||
|
|
||||||
|
// performance timing information
|
||||||
|
struct llama_timings {
|
||||||
|
double t_start_ms;
|
||||||
|
double t_end_ms;
|
||||||
|
double t_load_ms;
|
||||||
|
double t_sample_ms;
|
||||||
|
double t_p_eval_ms;
|
||||||
|
double t_eval_ms;
|
||||||
|
|
||||||
|
int32_t n_sample;
|
||||||
|
int32_t n_p_eval;
|
||||||
|
int32_t n_eval;
|
||||||
|
};
|
||||||
|
|
||||||
LLAMA_API struct llama_context_params llama_context_default_params();
|
LLAMA_API struct llama_context_params llama_context_default_params();
|
||||||
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
|
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
|
||||||
|
|
||||||
|
@ -331,6 +345,7 @@ extern "C" {
|
||||||
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
|
||||||
|
|
||||||
// Performance information
|
// Performance information
|
||||||
|
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
|
||||||
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
LLAMA_API void llama_print_timings(struct llama_context * ctx);
|
||||||
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
|
||||||
|
|
||||||
|
|
|
@ -136,7 +136,7 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
|
auto ggml_type = type == 0 ? GGML_TYPE_Q4_0 : GGML_TYPE_Q4_1;
|
||||||
|
|
||||||
auto funcs = ggml_internal_get_quantize_fn(ggml_type);
|
auto funcs = ggml_internal_get_type_traits(ggml_type);
|
||||||
|
|
||||||
Stat simple, ggml;
|
Stat simple, ggml;
|
||||||
|
|
||||||
|
@ -156,8 +156,8 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
t1 = std::chrono::high_resolution_clock::now();
|
t1 = std::chrono::high_resolution_clock::now();
|
||||||
float fs;
|
float fs;
|
||||||
if (type == 0) funcs.vec_dot_q(kVecSize * QK4_1, &fs, x40.data(), y.data());
|
if (type == 0) funcs.vec_dot(kVecSize * QK4_1, &fs, x40.data(), y.data());
|
||||||
else funcs.vec_dot_q(kVecSize * QK4_1, &fs, x41.data(), y.data());
|
else funcs.vec_dot(kVecSize * QK4_1, &fs, x41.data(), y.data());
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
t = 1e-3*std::chrono::duration_cast<std::chrono::nanoseconds>(t2-t1).count();
|
||||||
if (iloop > 3) ggml.addResult(fs, t);
|
if (iloop > 3) ggml.addResult(fs, t);
|
||||||
|
|
|
@ -235,7 +235,7 @@ int main(int argc, char** argv) {
|
||||||
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
|
int n4 = useQ4_1 ? kVecSize / QK4_1 : kVecSize / QK4_0; n4 = 64*((n4 + 63)/64);
|
||||||
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
|
int n8 = kVecSize / QK8_0; n8 = 64*((n8 + 63)/64);
|
||||||
|
|
||||||
auto funcs = useQ4_1 ? ggml_internal_get_quantize_fn(GGML_TYPE_Q4_1) : ggml_internal_get_quantize_fn(GGML_TYPE_Q4_0);
|
auto funcs = useQ4_1 ? ggml_internal_get_type_traits(GGML_TYPE_Q4_1) : ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
|
||||||
|
|
||||||
std::vector<block_q4_0> q40;
|
std::vector<block_q4_0> q40;
|
||||||
std::vector<block_q4_1> q41;
|
std::vector<block_q4_1> q41;
|
||||||
|
@ -261,9 +261,9 @@ int main(int argc, char** argv) {
|
||||||
// Note, we do not include this in the timing as in practical application
|
// Note, we do not include this in the timing as in practical application
|
||||||
// we already have the quantized model weights.
|
// we already have the quantized model weights.
|
||||||
if (useQ4_1) {
|
if (useQ4_1) {
|
||||||
funcs.quantize_row_q(x1.data(), q41.data(), kVecSize);
|
funcs.from_float(x1.data(), q41.data(), kVecSize);
|
||||||
} else {
|
} else {
|
||||||
funcs.quantize_row_q(x1.data(), q40.data(), kVecSize);
|
funcs.from_float(x1.data(), q40.data(), kVecSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Now measure time the dot product needs using the "scalar" version above
|
// Now measure time the dot product needs using the "scalar" version above
|
||||||
|
@ -282,9 +282,10 @@ int main(int argc, char** argv) {
|
||||||
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
|
dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
funcs.quantize_row_q_dot(y1.data(), q8.data(), kVecSize);
|
auto vdot = ggml_internal_get_type_traits(funcs.vec_dot_type);
|
||||||
if (useQ4_1) funcs.vec_dot_q(kVecSize, &result, q41.data(), q8.data());
|
vdot.from_float(y1.data(), q8.data(), kVecSize);
|
||||||
else funcs.vec_dot_q(kVecSize, &result, q40.data(), q8.data());
|
if (useQ4_1) funcs.vec_dot(kVecSize, &result, q41.data(), q8.data());
|
||||||
|
else funcs.vec_dot(kVecSize, &result, q40.data(), q8.data());
|
||||||
}
|
}
|
||||||
sumq += result;
|
sumq += result;
|
||||||
t2 = std::chrono::high_resolution_clock::now();
|
t2 = std::chrono::high_resolution_clock::now();
|
||||||
|
|
|
@ -1,6 +1,14 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
cp -rpv ../ggml/src/ggml.c ./ggml.c
|
cp -rpv ../ggml/src/ggml.c ./ggml.c
|
||||||
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
||||||
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
|
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
|
||||||
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h
|
||||||
|
cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp
|
||||||
|
cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h
|
||||||
|
cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m
|
||||||
|
cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
|
||||||
|
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
|
||||||
|
|
||||||
|
cp -rpv ../ggml/tests/test-opt.c ./tests/test-opt.c
|
||||||
|
cp -rpv ../ggml/tests/test-grad0.c ./tests/test-grad0.c
|
||||||
|
|
|
@ -10,5 +10,5 @@ llama_add_test(test-quantize-fns.cpp)
|
||||||
llama_add_test(test-quantize-perf.cpp)
|
llama_add_test(test-quantize-perf.cpp)
|
||||||
llama_add_test(test-sampling.cpp)
|
llama_add_test(test-sampling.cpp)
|
||||||
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
|
llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
|
||||||
# llama_add_test(test-grad0.c) # SLOW
|
llama_add_test(test-grad0.c) # SLOW
|
||||||
# llama_add_test(test-opt.c) # SLOW
|
# llama_add_test(test-opt.c) # SLOW
|
||||||
|
|
|
@ -10,6 +10,8 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#pragma GCC diagnostic ignored "-Wdouble-promotion"
|
||||||
|
|
||||||
#define MAX_NARGS 3
|
#define MAX_NARGS 3
|
||||||
|
|
||||||
#undef MIN
|
#undef MIN
|
||||||
|
@ -49,7 +51,7 @@ float frand(void) {
|
||||||
|
|
||||||
int irand(int n) {
|
int irand(int n) {
|
||||||
if (n == 0) return 0;
|
if (n == 0) return 0;
|
||||||
else return rand()%n;
|
return rand()%n;
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_random_dims(int64_t * dims, int ndims) {
|
void get_random_dims(int64_t * dims, int ndims) {
|
||||||
|
@ -159,12 +161,14 @@ struct ggml_tensor * get_random_tensor_int(
|
||||||
float get_element(const struct ggml_tensor * t, int idx) {
|
float get_element(const struct ggml_tensor * t, int idx) {
|
||||||
if (t->type == GGML_TYPE_F32) {
|
if (t->type == GGML_TYPE_F32) {
|
||||||
return ((float *)t->data)[idx];
|
return ((float *)t->data)[idx];
|
||||||
} else if (t->type == GGML_TYPE_I32) {
|
|
||||||
return ((int32_t *)t->data)[idx];
|
|
||||||
} else {
|
|
||||||
assert(false);
|
|
||||||
return INFINITY;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (t->type == GGML_TYPE_I32) {
|
||||||
|
return ((int32_t *)t->data)[idx];
|
||||||
|
}
|
||||||
|
|
||||||
|
assert(false);
|
||||||
|
return INFINITY;
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_element(struct ggml_tensor * t, int idx, float value) {
|
void set_element(struct ggml_tensor * t, int idx, float value) {
|
||||||
|
@ -215,15 +219,14 @@ bool check_gradient(
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph gf = ggml_build_forward (f);
|
struct ggml_cgraph gf = ggml_build_forward (f);
|
||||||
gf.n_threads = n_threads;
|
|
||||||
|
|
||||||
struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
|
struct ggml_cgraph gb = ggml_build_backward(ctx0, &gf, false);
|
||||||
gb.n_threads = n_threads;
|
|
||||||
|
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
||||||
|
|
||||||
ggml_graph_reset (&gf);
|
ggml_graph_reset (&gf);
|
||||||
ggml_set_f32 (f->grad, 1.0f);
|
ggml_set_f32 (f->grad, 1.0f);
|
||||||
ggml_graph_compute(ctx0, &gb);
|
|
||||||
|
ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
|
||||||
|
|
||||||
// ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
|
// ggml_graph_dump_dot(&gf, NULL, "test-grad0-forward.dot");
|
||||||
// ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot");
|
// ggml_graph_dump_dot(&gb, &gf, "test-grad0-backward.dot");
|
||||||
|
@ -236,15 +239,16 @@ bool check_gradient(
|
||||||
const float xm = x0 - eps;
|
const float xm = x0 - eps;
|
||||||
const float xp = x0 + eps;
|
const float xp = x0 + eps;
|
||||||
set_element(x[i], k, xp);
|
set_element(x[i], k, xp);
|
||||||
ggml_graph_compute(ctx0, &gf);
|
|
||||||
|
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
||||||
|
|
||||||
const float f0 = ggml_get_f32_1d(f, 0);
|
const float f0 = ggml_get_f32_1d(f, 0);
|
||||||
|
|
||||||
set_element(x[i], k, xm);
|
set_element(x[i], k, xm);
|
||||||
ggml_graph_compute(ctx0, &gf);
|
|
||||||
|
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
|
||||||
|
|
||||||
const float f1 = ggml_get_f32_1d(f, 0);
|
const float f1 = ggml_get_f32_1d(f, 0);
|
||||||
|
|
||||||
const float g0 = (f0 - f1)/(2.0f*eps);
|
const float g0 = (f0 - f1)/(2.0f*eps);
|
||||||
|
|
||||||
set_element(x[i], k, x0);
|
set_element(x[i], k, x0);
|
||||||
|
@ -252,12 +256,13 @@ bool check_gradient(
|
||||||
// compute gradient using backward graph
|
// compute gradient using backward graph
|
||||||
ggml_graph_reset (&gf);
|
ggml_graph_reset (&gf);
|
||||||
ggml_set_f32 (f->grad, 1.0f);
|
ggml_set_f32 (f->grad, 1.0f);
|
||||||
ggml_graph_compute(ctx0, &gb);
|
|
||||||
|
ggml_graph_compute_with_ctx(ctx0, &gb, n_threads);
|
||||||
|
|
||||||
const float g1 = get_element(x[i]->grad, k);
|
const float g1 = get_element(x[i]->grad, k);
|
||||||
|
|
||||||
const float error_abs = fabsf(g0 - g1);
|
const float error_abs = fabsf(g0 - g1);
|
||||||
const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabs(g0) : 0;
|
const float error_rel = g0 != 0 ? fabsf(g0 - g1)/fabsf(g0) : 0;
|
||||||
|
|
||||||
if (error_abs > max_error_abs || error_rel > max_error_rel) {
|
if (error_abs > max_error_abs || error_rel > max_error_rel) {
|
||||||
printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
|
printf("%s: ndims=%d, i=%d, k=%d, x0=%f, xm=%f, xp=%f, f0=%f, f1=%f, g0=%f, g1=%f, eps=%f, error_abs=%f, error_rel=%f\n",
|
||||||
|
@ -1154,7 +1159,7 @@ int main(int argc, const char ** argv) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode));
|
struct ggml_tensor * f = ggml_sum(ctx0, ggml_rope(ctx0, x[0], n_past, n_rot, mode, 0));
|
||||||
|
|
||||||
GGML_PRINT_DEBUG("rope: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
|
GGML_PRINT_DEBUG("rope: n_past: %d n_rot: %d mode: %d\n", n_past, n_rot, mode);
|
||||||
check_gradient("rope", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
|
check_gradient("rope", ctx0, x, f, ndims, nargs, 1e-2f, 1e-3f, INFINITY);
|
||||||
|
|
|
@ -7,6 +7,7 @@
|
||||||
|
|
||||||
#define MAX_NARGS 2
|
#define MAX_NARGS 2
|
||||||
|
|
||||||
|
#pragma GCC diagnostic ignored "-Wdouble-promotion"
|
||||||
|
|
||||||
//
|
//
|
||||||
// logging
|
// logging
|
||||||
|
@ -33,7 +34,7 @@
|
||||||
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
||||||
|
|
||||||
|
|
||||||
float frand() {
|
float frand(void) {
|
||||||
return (float)rand()/(float)RAND_MAX;
|
return (float)rand()/(float)RAND_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -114,7 +115,7 @@ void set_element(struct ggml_tensor * t, int idx, float value) {
|
||||||
((float *)t->data)[idx] = value;
|
((float *)t->data)[idx] = value;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, const char ** argv) {
|
int main(void) {
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
.mem_size = 1024*1024*1024,
|
.mem_size = 1024*1024*1024,
|
||||||
.mem_buffer = NULL,
|
.mem_buffer = NULL,
|
||||||
|
@ -137,10 +138,11 @@ int main(int argc, const char ** argv) {
|
||||||
struct ggml_tensor * d = ggml_sub(ctx, c, ab);
|
struct ggml_tensor * d = ggml_sub(ctx, c, ab);
|
||||||
struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d));
|
struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d));
|
||||||
|
|
||||||
|
|
||||||
struct ggml_cgraph ge = ggml_build_forward(e);
|
struct ggml_cgraph ge = ggml_build_forward(e);
|
||||||
ggml_graph_reset (&ge);
|
ggml_graph_reset(&ge);
|
||||||
ggml_graph_compute(ctx, &ge);
|
|
||||||
|
ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
|
||||||
|
|
||||||
const float fe = ggml_get_f32_1d(e, 0);
|
const float fe = ggml_get_f32_1d(e, 0);
|
||||||
printf("%s: e = %.4f\n", __func__, fe);
|
printf("%s: e = %.4f\n", __func__, fe);
|
||||||
|
|
||||||
|
@ -148,8 +150,10 @@ int main(int argc, const char ** argv) {
|
||||||
|
|
||||||
ggml_opt(ctx, opt_params, e);
|
ggml_opt(ctx, opt_params, e);
|
||||||
|
|
||||||
ggml_graph_reset (&ge);
|
ggml_graph_reset(&ge);
|
||||||
ggml_graph_compute(ctx, &ge);
|
|
||||||
|
ggml_graph_compute_with_ctx(ctx, &ge, /*n_threads*/ 1);
|
||||||
|
|
||||||
const float fe_opt = ggml_get_f32_1d(e, 0);
|
const float fe_opt = ggml_get_f32_1d(e, 0);
|
||||||
printf("%s: original e = %.4f\n", __func__, fe);
|
printf("%s: original e = %.4f\n", __func__, fe);
|
||||||
printf("%s: optimized e = %.4f\n", __func__, fe_opt);
|
printf("%s: optimized e = %.4f\n", __func__, fe_opt);
|
||||||
|
|
|
@ -40,26 +40,26 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Total quantization error on test data
|
// Total quantization error on test data
|
||||||
float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
|
float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||||
std::vector<uint8_t> tmp_q(2*test_size);
|
std::vector<uint8_t> tmp_q(2*test_size);
|
||||||
std::vector<float> tmp_out(test_size);
|
std::vector<float> tmp_out(test_size);
|
||||||
|
|
||||||
qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
|
qfns.from_float(test_data, tmp_q.data(), test_size);
|
||||||
qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
|
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||||
return array_rmse(test_data, tmp_out.data(), test_size);
|
return array_rmse(test_data, tmp_out.data(), test_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Total quantization error on test data
|
// Total quantization error on test data
|
||||||
float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
|
float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||||
std::vector<uint8_t> tmp_q(2*test_size);
|
std::vector<uint8_t> tmp_q(2*test_size);
|
||||||
std::vector<float> tmp_out(test_size);
|
std::vector<float> tmp_out(test_size);
|
||||||
std::vector<float> tmp_out_ref(test_size);
|
std::vector<float> tmp_out_ref(test_size);
|
||||||
|
|
||||||
qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
|
qfns.from_float(test_data, tmp_q.data(), test_size);
|
||||||
qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
|
qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
|
||||||
|
|
||||||
qfns.quantize_row_q_reference(test_data, tmp_q.data(), test_size);
|
qfns.from_float_reference(test_data, tmp_q.data(), test_size);
|
||||||
qfns.dequantize_row_q(tmp_q.data(), tmp_out_ref.data(), test_size);
|
qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);
|
||||||
|
|
||||||
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
||||||
}
|
}
|
||||||
|
@ -73,15 +73,17 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Total dot product error
|
// Total dot product error
|
||||||
float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
|
float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
|
||||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||||
|
|
||||||
qfns.quantize_row_q (test_data1, tmp_q1.data(), test_size);
|
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
||||||
qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);
|
|
||||||
|
qfns.from_float(test_data1, tmp_q1.data(), test_size);
|
||||||
|
vdot.from_float(test_data2, tmp_q2.data(), test_size);
|
||||||
|
|
||||||
float result = INFINITY;
|
float result = INFINITY;
|
||||||
qfns.vec_dot_q(test_size, &result, tmp_q1.data(), tmp_q2.data());
|
qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());
|
||||||
|
|
||||||
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
const float dot_ref = dot_product(test_data1, test_data2, test_size);
|
||||||
|
|
||||||
|
@ -123,9 +125,9 @@ int main(int argc, char * argv[]) {
|
||||||
|
|
||||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||||
ggml_type type = (ggml_type) i;
|
ggml_type type = (ggml_type) i;
|
||||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
||||||
|
|
||||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
if (qfns.from_float && qfns.to_float) {
|
||||||
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
|
||||||
const float max_quantization_error =
|
const float max_quantization_error =
|
||||||
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
|
||||||
|
|
|
@ -123,9 +123,9 @@ void usage(char * argv[]) {
|
||||||
printf(" --type TYPE set test type as");
|
printf(" --type TYPE set test type as");
|
||||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||||
ggml_type type = (ggml_type) i;
|
ggml_type type = (ggml_type) i;
|
||||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
|
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
||||||
if (ggml_type_name(type) != NULL) {
|
if (ggml_type_name(type) != NULL) {
|
||||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
if (qfns.from_float && qfns.to_float) {
|
||||||
printf(" %s", ggml_type_name(type));
|
printf(" %s", ggml_type_name(type));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -271,12 +271,12 @@ int main(int argc, char * argv[]) {
|
||||||
|
|
||||||
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
|
||||||
ggml_type type = (ggml_type) i;
|
ggml_type type = (ggml_type) i;
|
||||||
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
|
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
|
||||||
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
|
if (qfns.from_float && qfns.to_float) {
|
||||||
printf("%s\n", ggml_type_name(type));
|
printf("%s\n", ggml_type_name(type));
|
||||||
|
|
||||||
if (params.op_quantize_row_q_reference) {
|
if (params.op_quantize_row_q_reference) {
|
||||||
|
@ -284,7 +284,7 @@ int main(int argc, char * argv[]) {
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void ) {
|
||||||
qfns.quantize_row_q_reference(test_data1, test_q1, size);
|
qfns.from_float_reference(test_data1, test_q1, size);
|
||||||
return test_q1[0];
|
return test_q1[0];
|
||||||
};
|
};
|
||||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||||
|
@ -298,7 +298,7 @@ int main(int argc, char * argv[]) {
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void ) {
|
||||||
qfns.quantize_row_q(test_data1, test_q1, size);
|
qfns.from_float(test_data1, test_q1, size);
|
||||||
return test_q1[0];
|
return test_q1[0];
|
||||||
};
|
};
|
||||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||||
|
@ -309,11 +309,11 @@ int main(int argc, char * argv[]) {
|
||||||
|
|
||||||
if (params.op_dequantize_row_q) {
|
if (params.op_dequantize_row_q) {
|
||||||
printf(" dequantize_row_q\n");
|
printf(" dequantize_row_q\n");
|
||||||
qfns.quantize_row_q(test_data1, test_q1, largest);
|
qfns.from_float(test_data1, test_q1, largest);
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void ) {
|
||||||
qfns.dequantize_row_q(test_q1, test_out, size);
|
qfns.to_float(test_q1, test_out, size);
|
||||||
return test_out[0];
|
return test_out[0];
|
||||||
};
|
};
|
||||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||||
|
@ -327,7 +327,8 @@ int main(int argc, char * argv[]) {
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void ) {
|
||||||
qfns.quantize_row_q_dot(test_data1, test_q1, size);
|
auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
|
||||||
|
vdot.from_float(test_data1, test_q1, size);
|
||||||
return test_q1[0];
|
return test_q1[0];
|
||||||
};
|
};
|
||||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||||
|
@ -338,13 +339,13 @@ int main(int argc, char * argv[]) {
|
||||||
|
|
||||||
if (params.op_vec_dot_q) {
|
if (params.op_vec_dot_q) {
|
||||||
printf(" vec_dot_q\n");
|
printf(" vec_dot_q\n");
|
||||||
qfns.quantize_row_q(test_data1, test_q1, largest);
|
qfns.from_float(test_data1, test_q1, largest);
|
||||||
qfns.quantize_row_q(test_data2, test_q2, largest);
|
qfns.from_float(test_data2, test_q2, largest);
|
||||||
for (size_t size : params.test_sizes) {
|
for (size_t size : params.test_sizes) {
|
||||||
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
|
||||||
auto quantize_fn = [&](void ) {
|
auto quantize_fn = [&](void ) {
|
||||||
float result;
|
float result;
|
||||||
qfns.vec_dot_q(size, &result, test_q1, test_q2);
|
qfns.vec_dot(size, &result, test_q1, test_q2);
|
||||||
return result;
|
return result;
|
||||||
};
|
};
|
||||||
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue