Merge branch 'master' into master

This commit is contained in:
Rickard Hallerbäck 2023-09-15 15:14:56 +02:00 committed by GitHub
commit e004116de9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
34 changed files with 2029 additions and 470 deletions

22
.devops/cloud-v-pipeline Normal file
View file

@ -0,0 +1,22 @@
node('x86_runner1'){ // Running on x86 runner containing latest vector qemu, latest vector gcc and all the necessary libraries
stage('Cleanup'){
cleanWs() // Cleaning previous CI build in workspace
}
stage('checkout repo'){
retry(5){ // Retry if the cloning fails due to some reason
checkout scm // Clone the repo on Runner
}
}
stage('Compiling llama.cpp'){
sh'''#!/bin/bash
make RISCV=1 RISCV_CROSS_COMPILE=1 # Compiling llama for RISC-V
'''
}
stage('Running llama.cpp'){
sh'''#!/bin/bash
module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
cat llama_log.txt # Printing results
'''
}
}

View file

@ -27,7 +27,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
id: depends
@ -52,7 +52,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
id: depends
@ -87,7 +87,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
id: depends
@ -121,7 +121,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
id: depends
@ -149,7 +149,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
id: depends
@ -174,7 +174,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Dependencies
id: depends
@ -280,7 +280,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Download OpenCL SDK
id: get_opencl
@ -390,20 +390,19 @@ jobs:
strategy:
matrix:
cuda: ['12.1.0', '11.7.1']
cuda: ['12.2.0', '11.7.1']
build: ['cublas']
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
- uses: Jimver/cuda-toolkit@v0.2.10
- uses: Jimver/cuda-toolkit@v0.2.11
id: cuda-toolkit
with:
cuda: ${{ matrix.cuda }}
# TODO(green-sky): _dev seems to fail, and non dev are not enought
#sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Build
id: cmake_build
@ -440,27 +439,11 @@ jobs:
llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
- name: Copy and pack Cuda runtime
if: ${{ matrix.cuda == '12.1.0' }}
# TODO(green-sky): paths are cuda 12 specific
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
mkdir '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
- name: Copy and pack Cuda runtime
if: ${{ matrix.cuda == '11.7.1' }}
# TODO(green-sky): paths are cuda 11 specific
run: |
echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
mkdir '.\build\bin\cudart\'
ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
$dst='.\build\bin\cudart\'
robocopy "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin" $dst cudart64_*.dll cublas64_*.dll cublasLt64_*.dll
7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip $dst\*
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -469,6 +452,22 @@ jobs:
path: |
cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
freeBSD-latest:
runs-on: macos-12
steps:
- name: Clone
uses: actions/checkout@v3
- name: Build
uses: cross-platform-actions/action@v0.19.0
with:
operating_system: freebsd
version: '13.2'
run: |
sudo pkg update
sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15
release:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
@ -485,7 +484,7 @@ jobs:
steps:
- name: Clone
id: checkout
uses: actions/checkout@v1
uses: actions/checkout@v3
- name: Determine tag name
id: tag
@ -543,7 +542,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |
@ -567,7 +566,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |
@ -591,7 +590,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |
@ -621,7 +620,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
# uses: actions/checkout@v3
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
@ -660,7 +659,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
# uses: actions/checkout@v3
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v1
@ -706,7 +705,7 @@ jobs:
#
# steps:
# - name: Clone
# uses: actions/checkout@v1
# uses: actions/checkout@v3
#
# - name: Dependencies
# run: |

View file

@ -26,8 +26,15 @@ jobs:
strategy:
matrix:
config:
- { tag: "light", dockerfile: ".devops/main.Dockerfile" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile" }
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
# have disabled them for now until the reason why
# is understood.
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
steps:
- name: Check out the repo
uses: actions/checkout@v3
@ -51,7 +58,7 @@ jobs:
with:
context: .
push: true
platforms: linux/amd64,linux/arm64
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
@ -60,6 +67,6 @@ jobs:
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: linux/amd64,linux/arm64
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

View file

@ -24,7 +24,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v2
with:

View file

@ -135,6 +135,7 @@ set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
include(CheckCXXCompilerFlag)
if (NOT MSVC)
if (LLAMA_SANITIZE_THREAD)
@ -171,8 +172,8 @@ if (LLAMA_METAL)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
message(STATUS "Metal framework found")
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
set(GGML_HEADERS_METAL ggml-metal.h)
set(GGML_SOURCES_METAL ggml-metal.m)
add_compile_definitions(GGML_USE_METAL)
if (LLAMA_METAL_NDEBUG)
@ -191,7 +192,6 @@ if (LLAMA_METAL)
${METALKIT_FRAMEWORK}
)
endif()
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@ -268,7 +268,8 @@ if (LLAMA_BLAS)
endif()
if (LLAMA_K_QUANTS)
set(GGML_SOURCES_EXTRA ${GGML_SOURCES_EXTRA} k_quants.c k_quants.h)
set(GGML_HEADERS_EXTRA k_quants.h)
set(GGML_SOURCES_EXTRA k_quants.c)
add_compile_definitions(GGML_USE_K_QUANTS)
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
@ -284,7 +285,8 @@ if (LLAMA_CUBLAS)
enable_language(CUDA)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
set(GGML_HEADERS_CUDA ggml-cuda.h)
set(GGML_SOURCES_CUDA ggml-cuda.cu)
add_compile_definitions(GGML_USE_CUBLAS)
# if (LLAMA_CUDA_CUBLAS)
@ -332,6 +334,7 @@ if (LLAMA_MPI)
find_package(MPI)
if (MPI_C_FOUND)
message(STATUS "MPI found")
set(GGML_HEADERS_MPI ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.c ggml-mpi.h)
add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
@ -354,7 +357,8 @@ if (LLAMA_CLBLAST)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
set(GGML_HEADERS_OPENCL ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
add_compile_definitions(GGML_USE_CLBLAST)
@ -382,13 +386,15 @@ if (LLAMA_HIPBLAS)
message(STATUS "HIP and hipBLAS found")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
if (BUILD_SHARED_LIBS)
set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
if (LLAMA_CUDA_FORCE_DMMV)
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
endif()
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
target_compile_definitions(ggml-rocm PRIVATE CC_TURING=1000000000)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
@ -439,7 +445,7 @@ if (LLAMA_ALL_WARNINGS)
endif()
if (MSVC)
if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
if (BUILD_SHARED_LIBS)
@ -461,6 +467,13 @@ endif()
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (MSVC)
string(TOLOWER "${CMAKE_GENERATOR_PLATFORM}" CMAKE_GENERATOR_PLATFORM_LWR)
message(STATUS "CMAKE_GENERATOR_PLATFORM: ${CMAKE_GENERATOR_PLATFORM}")
else ()
set(CMAKE_GENERATOR_PLATFORM_LWR "")
endif ()
if (NOT MSVC)
if (LLAMA_STATIC)
add_link_options(-static)
@ -476,25 +489,33 @@ if (NOT MSVC)
endif()
endif()
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64"))
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
message(STATUS "ARM detected")
if (MSVC)
# TODO: arm msvc?
add_compile_definitions(__ARM_NEON)
add_compile_definitions(__ARM_FEATURE_FMA)
add_compile_definitions(__ARM_FEATURE_DOTPROD)
# add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
else()
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
add_compile_options(-mfp16-format=ieee)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
# Raspberry Pi 1, Zero
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access)
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
# Raspberry Pi 2
add_compile_options(-mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations)
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
# Raspberry Pi 3, 4, Zero 2 (32-bit)
add_compile_options(-mfp16-format=ieee -mno-unaligned-access)
add_compile_options(-mno-unaligned-access)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
message(STATUS "x86 detected")
if (MSVC)
if (LLAMA_AVX512)
@ -616,11 +637,11 @@ add_library(ggml OBJECT
ggml.h
ggml-alloc.c
ggml-alloc.h
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
${GGML_SOURCES_METAL}
${GGML_SOURCES_MPI}
${GGML_SOURCES_EXTRA}
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
)
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
@ -658,14 +679,53 @@ if (BUILD_SHARED_LIBS)
if (LLAMA_METAL)
set_target_properties(llama PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif()
install(TARGETS llama LIBRARY)
endif()
#
# install
#
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
set(LLAMA_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}
CACHE PATH "Location of header files")
set(LLAMA_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR}
CACHE PATH "Location of library files")
set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}
CACHE PATH "Location of binary files")
set(LLAMA_BUILD_NUMBER ${BUILD_NUMBER})
set(LLAMA_BUILD_COMMIT ${BUILD_COMMIT})
set(LLAMA_INSTALL_VERSION 0.0.${BUILD_NUMBER})
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/scripts/LlamaConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama
PATH_VARS LLAMA_INCLUDE_INSTALL_DIR
LLAMA_LIB_INSTALL_DIR
LLAMA_BIN_INSTALL_DIR )
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
VERSION ${LLAMA_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/llama.h)
install(TARGETS llama LIBRARY PUBLIC_HEADER)
install(
FILES convert.py
PERMISSIONS

View file

@ -2,7 +2,7 @@
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama
# Code coverage output files
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@ -49,7 +49,7 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1" ]; then \
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
continue; \
else \
echo "Running test $$test_target..."; \
@ -110,50 +110,42 @@ MK_LDFLAGS =
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
MK_CFLAGS += -D_XOPEN_SOURCE=600
MK_CXXFLAGS += -D_XOPEN_SOURCE=600
MK_CPPFLAGS += -D_XOPEN_SOURCE=600
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
ifeq ($(UNAME_S),OpenBSD)
MK_CFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
MK_CXXFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
MK_CPPFLAGS += -U_XOPEN_SOURCE -D_XOPEN_SOURCE=700
endif
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
ifeq ($(UNAME_S),Linux)
MK_CFLAGS += -D_GNU_SOURCE
MK_CXXFLAGS += -D_GNU_SOURCE
MK_CPPFLAGS += -D_GNU_SOURCE
endif
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
ifeq ($(UNAME_S),Darwin)
MK_CFLAGS += -D_DARWIN_C_SOURCE
MK_CXXFLAGS += -D_DARWIN_C_SOURCE
MK_CPPFLAGS += -D_DARWIN_C_SOURCE
endif
ifeq ($(UNAME_S),DragonFly)
MK_CFLAGS += -D__BSD_VISIBLE
MK_CXXFLAGS += -D__BSD_VISIBLE
MK_CPPFLAGS += -D__BSD_VISIBLE
endif
# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
ifeq ($(UNAME_S),FreeBSD)
MK_CFLAGS += -D__BSD_VISIBLE
MK_CXXFLAGS += -D__BSD_VISIBLE
MK_CPPFLAGS += -D__BSD_VISIBLE
endif
ifeq ($(UNAME_S),NetBSD)
MK_CFLAGS += -D_NETBSD_SOURCE
MK_CXXFLAGS += -D_NETBSD_SOURCE
MK_CPPFLAGS += -D_NETBSD_SOURCE
endif
ifeq ($(UNAME_S),OpenBSD)
MK_CFLAGS += -D_BSD_SOURCE
MK_CXXFLAGS += -D_BSD_SOURCE
MK_CPPFLAGS += -D_BSD_SOURCE
endif
ifdef LLAMA_DEBUG
@ -182,7 +174,7 @@ MK_CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow
-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
MK_CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
ifeq '' '$(findstring clang++,$(CXX))'
ifeq '' '$(findstring clang,$(shell $(CXX) --version))'
# g++ only
MK_CXXFLAGS += -Wno-format-truncation -Wno-array-bounds
endif
@ -408,7 +400,6 @@ ifdef LLAMA_HIPBLAS
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
HIPFLAGS += -DCC_TURING=1000000000
ifdef LLAMA_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # LLAMA_CUDA_FORCE_DMMV
@ -606,7 +597,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1: tests/test-tokenizer-1.cpp build-info.h ggml.o llama.o common.o $(OBJS)
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-c.o: tests/test-c.c llama.h

View file

@ -844,8 +844,17 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
#### Images
We have two Docker images available for this project:
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
Additionally, there the following images, similar to the above:
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the Gitlab Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
#### Usage

View file

@ -374,6 +374,17 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers_draft = std::stoi(argv[i]);
#else
fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
#endif
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
@ -423,8 +434,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--mtest") {
params.mem_test = true;
} else if (arg == "--numa") {
params.numa = true;
} else if (arg == "--export") {
@ -664,6 +673,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
printf(" -ngl N, --n-gpu-layers N\n");
printf(" number of layers to store in VRAM\n");
printf(" -ngld N, --n-gpu-layers-draft N\n");
printf(" number of layers to store in VRAM for the draft model\n");
printf(" -ts SPLIT --tensor-split SPLIT\n");
printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
@ -674,7 +685,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" Not recommended since this is both slower and uses more VRAM.\n");
#endif // GGML_USE_CUBLAS
#endif
printf(" --mtest compute maximum memory usage\n");
printf(" --export export the computation graph to 'llama.ggml'\n");
printf(" --verbose-prompt print prompt before generation\n");
fprintf(stderr, " --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
@ -1212,7 +1222,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);

View file

@ -38,6 +38,7 @@ struct gpt_params {
int32_t n_draft = 16; // number of tokens to draft during speculative decoding
int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
@ -109,7 +110,6 @@ struct gpt_params {
bool perplexity = false; // compute perplexity over the prompt
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
bool mem_test = false; // compute maximum memory usage
bool numa = false; // attempt optimizations that help on some NUMA systems
bool export_cgraph = false; // export the computation graph
bool verbose_prompt = false; // print prompt tokens before generation

292
convert-baichuan-hf-to-gguf.py Executable file
View file

@ -0,0 +1,292 @@
#!/usr/bin/env python3
# HF baichuan --> gguf conversion
from __future__ import annotations
import argparse
import json
import os
import struct
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Any
import itertools
import gguf
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor # type: ignore[import]
if TYPE_CHECKING:
from typing import TypeAlias
NDArray: TypeAlias = 'np.ndarray[Any, Any]'
# reverse HF permute back to original pth layout
def reverse_hf_permute(weights: NDArray, n_head: int, n_kv_head: int | None = None) -> NDArray:
if n_kv_head is not None and n_head != n_kv_head:
n_head //= n_kv_head
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
.swapaxes(1, 2)
.reshape(weights.shape))
def reverse_hf_permute_part(weights: NDArray, n_part: int, n_head: int, n_head_kv: int| None = None) -> NDArray:
r = weights.shape[0] // 3
return (reverse_hf_permute(weights[r * n_part : r * n_part + r, ...], n_head, n_head_kv))
def reverse_hf_part(weights: NDArray, n_part: int) -> NDArray:
r = weights.shape[0] // 3
return weights[r * n_part : r * n_part + r, ...]
def count_model_parts(dir_model: str) -> int:
num_parts = 0
for filename in os.listdir(dir_model):
if filename.startswith("pytorch_model-"):
num_parts += 1
if num_parts > 0:
print("gguf: found " + str(num_parts) + " model parts")
return num_parts
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Convert a HuggingFace LLaMA model to a GGML compatible file")
parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)")
parser.add_argument("ftype", type=int, choices=[0, 1], help="output format - use 0 for float32, 1 for float16", default = 1)
return parser.parse_args()
args = parse_args()
dir_model = args.model
ftype = args.ftype
if not dir_model.is_dir():
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
# map from ftype to string
ftype_str = ["f32", "f16"]
if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf'
print("gguf: loading model "+dir_model.name)
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
print("hello print: ",hparams["architectures"][0])
if hparams["architectures"][0] != "BaichuanForCausalLM":
print("Model architecture not supported: " + hparams["architectures"][0])
sys.exit()
# get number of model parts
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
block_count = hparams["num_hidden_layers"]
head_count = hparams["num_attention_heads"]
if "num_key_value_heads" in hparams:
head_count_kv = hparams["num_key_value_heads"]
else:
head_count_kv = head_count
if "_name_or_path" in hparams:
hf_repo = hparams["_name_or_path"]
else:
hf_repo = ""
if "max_sequence_length" in hparams:
ctx_length = hparams["max_sequence_length"]
elif "max_position_embeddings" in hparams:
ctx_length = hparams["max_position_embeddings"]
elif "model_max_length" in hparams:
ctx_length = hparams["model_max_length"]
else:
print("gguf: can not find ctx length parameter.")
sys.exit()
gguf_writer.add_name(dir_model.name)
gguf_writer.add_source_hf_repo(hf_repo)
gguf_writer.add_tensor_data_layout("Meta AI original pth")
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hparams["hidden_size"])
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams["intermediate_size"])
gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_layer_norm_rms_eps(hparams["rms_norm_eps"])
if "rope_scaling" in hparams and hparams["rope_scaling"] != None and "factor" in hparams["rope_scaling"]:
if "type" in hparams["rope_scaling"]:
if hparams["rope_scaling"]["type"] == "linear":
gguf_writer.add_rope_scale_linear(hparams["rope_scaling"]["factor"])
# TOKENIZATION
print("gguf: get tokenizer metadata")
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
tokenizer_model_file = dir_model / 'tokenizer.model'
if not tokenizer_model_file.is_file():
print(f'Error: Missing {tokenizer_model_file}', file = sys.stderr)
sys.exit(1)
# vocab type sentencepiece
print("gguf: get sentencepiece tokenizer vocab, scores and token types")
tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1 # defualt to normal token type
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
# toktype = 4 is user-defined = tokens from added_tokens.json
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
added_tokens_file = dir_model / 'added_tokens.json'
if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f:
addtokens_json = json.load(f)
print("gguf: get added tokens")
for key in addtokens_json:
tokens.append( key.encode("utf-8") )
scores.append(-1000.0)
toktypes.append(4) # user-defined token type
gguf_writer.add_tokenizer_model("llama")
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(dir_model)
special_vocab.add_to_gguf(gguf_writer)
# TENSORS
tensor_map = gguf.get_tensor_name_map(ARCH,block_count)
# tensor info
print("gguf: get tensor metadata")
if num_parts == 0:
part_names = iter(("pytorch_model.bin",))
else:
part_names = (
f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
)
for part_name in part_names:
if args.vocab_only:
break
print("gguf: loading model part '" + part_name + "'")
model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu")
tmp=model_part
for i in range(block_count):
if f"model.layers.{i}.self_attn.W_pack.weight" in model_part:
print(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],0,head_count,head_count)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"]=reverse_hf_permute_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],1,head_count,head_count_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"]=reverse_hf_part(model_part[f"model.layers.{i}.self_attn.W_pack.weight"],2)
del tmp[f"model.layers.{i}.self_attn.W_pack.weight"]
for name in model_part.keys():
data = model_part[name]
# we don't need these
if name.endswith(".rotary_emb.inv_freq"):
continue
old_dtype = data.dtype
# convert any unsupported data types to float32
if data.dtype != torch.float16 and data.dtype != torch.float32:
data = data.to(torch.float32)
data = data.squeeze().numpy()
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
n_dims = len(data.shape)
data_dtype = data.dtype
# if f32 desired, convert any float16 to float32
if ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)
# if f16 desired, convert any float32 2-dim weight tensors to float16
if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)
print(name + " -> " + new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
if not args.vocab_only:
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print("")

View file

@ -137,7 +137,9 @@ with open(tokenizer_json_file, "r", encoding="utf-8") as f:
print("gguf: get gpt2 tokenizer vocab")
vocab_size = len(tokenizer_json["model"]["vocab"])
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
tokenizer = AutoTokenizer.from_pretrained(dir_model)

51
examples/main-cmake-pkg/.gitignore vendored Normal file
View file

@ -0,0 +1,51 @@
# Prerequisites
*.d
# Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
*.gguf
*.log
.DS_Store
.build/
.cache/
.direnv/
.envrc
.swiftpm
.venv
.clang-tidy
.vs/
.vscode/
build*/
out/
tmp/

View file

@ -0,0 +1,36 @@
cmake_minimum_required(VERSION 3.12)
project("main-cmake-pkg" C CXX)
set(TARGET main-cmake-pkg)
find_package(Llama 0.0.1 REQUIRED)
# Bake common functionality in with target. Because applications
# using the relocatable Llama package should be outside of the
# source tree, main-cmake-pkg pretends the dependencies are built-in.
set(_common_path "${CMAKE_CURRENT_LIST_DIR}/../../common")
add_library(common OBJECT
${_common_path}/common.h
${_common_path}/common.cpp
${_common_path}/console.h
${_common_path}/console.cpp
${_common_path}/grammar-parser.h
${_common_path}/grammar-parser.cpp
)
# WARNING: because build-info.h is auto-generated, it will only
# be available after the user has built the llama.cpp sources.
#
configure_file(${_common_path}/../build-info.h
${CMAKE_CURRENT_BINARY_DIR}/build-info.h
COPYONLY)
target_include_directories(common PUBLIC ${LLAMA_INCLUDE_DIR}
${CMAKE_CURRENT_BINARY_DIR})
add_executable(${TARGET} ${CMAKE_CURRENT_LIST_DIR}/../main/main.cpp)
target_include_directories(${TARGET} PRIVATE ${_common_path})
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -0,0 +1,37 @@
# llama.cpp/example/main-cmake-pkg
This program builds the [main](../main) application using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
## Building
Because this example is "outside of the source tree", it is important to first build/install llama.cpp using CMake. An example is provided here, but please see the [llama.cpp build instructions](../..) for more detailed build instructions.
### Considerations
When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
### Build llama.cpp and install to C:\LlamaCPP directory
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/LlamaCPP
```
### Build main-cmake-pkg
```cmd
cd ..\examples\main-cmake-pkg
mkdir build
cd build
cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build . --config Release
cmake --install . --prefix C:/MyLlamaApp
```

View file

@ -274,7 +274,7 @@ These options help improve the performance and memory usage of the LLaMA models.
### NUMA support
- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop\_caches' as root.
- `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
### Memory Float 32
@ -302,7 +302,6 @@ These options provide extra functionality and customization when running the LLa
- `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated.
- `--verbose-prompt`: Print the prompt before generating text.
- `--mtest`: Test the model's functionality by running a series of tests to ensure it's working properly.
- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.

View file

@ -198,23 +198,6 @@ int main(int argc, char ** argv) {
params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
}
// determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters
// uncomment the "used_mem" line in llama.cpp to see the results
if (params.mem_test) {
{
LOG_TEE("%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx);
const std::vector<llama_token> tmp(params.n_batch, llama_token_bos(ctx));
llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads);
}
llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);
return 0;
}
// export the cgraph and exit
if (params.export_cgraph) {
llama_eval_export(ctx, "llama.ggml");

View file

@ -42,6 +42,7 @@ int main(int argc, char ** argv) {
// load the draft model
params.model = params.model_draft;
params.n_gpu_layers = params.n_gpu_layers_draft;
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
// tokenize the prompt
@ -81,7 +82,7 @@ int main(int argc, char ** argv) {
//GGML_ASSERT(n_vocab == llama_n_vocab(ctx_dft));
// how many tokens to draft each time
const int n_draft = params.n_draft;
int n_draft = params.n_draft;
int n_predict = 0;
int n_drafted = 0;
@ -130,6 +131,7 @@ int main(int argc, char ** argv) {
LOG("drafted: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_dft, drafted));
int i_dft = 0;
while (true) {
// sample from the target model
const llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft);
@ -173,6 +175,27 @@ int main(int argc, char ** argv) {
llama_eval(ctx_dft, &id, 1, n_past_dft, params.n_threads);
++n_past_dft;
// heuristic for n_draft
{
const int n_draft_cur = (int) drafted.size();
const bool all_accepted = i_dft == n_draft_cur;
LOG("n_draft = %d\n", n_draft);
LOG("n_draft_cur = %d\n", n_draft_cur);
LOG("i_dft = %d\n", i_dft);
LOG("all_accepted = %d\n", all_accepted);
if (all_accepted && n_draft == n_draft_cur) {
LOG(" - max drafted tokens accepted - n_draft += 8\n");
n_draft = std::min(30, n_draft + 8);
} else if (all_accepted) {
LOG(" - partially drafted tokens accepted - no change\n");
} else {
LOG(" - drafted token rejected - n_draft -= 1\n");
n_draft = std::max(2, n_draft - 1);
}
}
drafted.clear();
drafted.push_back(id);

View file

@ -34,7 +34,7 @@
with pkgs; [ openblas ]
);
pkgs = import nixpkgs { inherit system; };
nativeBuildInputs = with pkgs; [ cmake ninja pkgconfig ];
nativeBuildInputs = with pkgs; [ cmake ninja pkg-config ];
llama-python =
pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]);
postPatch = ''
@ -45,6 +45,8 @@
postInstall = ''
mv $out/bin/main $out/bin/llama
mv $out/bin/server $out/bin/llama-server
mkdir -p $out/include
cp ${src}/llama.h $out/include/
'';
cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ];
in

View file

@ -13,7 +13,7 @@
#ifdef __HIP_PLATFORM_AMD__
// for rocblas_initialize()
#include "rocblas/rocblas.h"
#endif
#endif // __HIP_PLATFORM_AMD__
#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
@ -68,19 +68,29 @@
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
#endif // defined(GGML_USE_HIPBLAS)
#include "ggml-cuda.h"
#include "ggml.h"
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
#ifndef CC_TURING
#define CC_TURING 700
#endif
#define CC_OFFSET_AMD 1000000
#define CC_RDNA2 CC_OFFSET_AMD + 1030
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
defined(__gfx1150__) || defined(__gfx1151__)
#define RDNA3
#endif
#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
#define RDNA2
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
@ -132,7 +142,7 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
#endif
return c;
}
#endif
#endif // defined(GGML_USE_HIPBLAS)
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -3472,6 +3482,12 @@ static __device__ __forceinline__ void mul_mat_q(
}
}
#define MMQ_X_Q4_0_RDNA2 64
#define MMQ_Y_Q4_0_RDNA2 128
#define NWARPS_Q4_0_RDNA2 8
#define MMQ_X_Q4_0_RDNA1 64
#define MMQ_Y_Q4_0_RDNA1 64
#define NWARPS_Q4_0_RDNA1 8
#define MMQ_X_Q4_0_AMPERE 64
#define MMQ_Y_Q4_0_AMPERE 128
#define NWARPS_Q4_0_AMPERE 4
@ -3479,11 +3495,32 @@ static __device__ __forceinline__ void mul_mat_q(
#define MMQ_Y_Q4_0_PASCAL 64
#define NWARPS_Q4_0_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q4_0(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q4_0(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q4_0_RDNA2;
const int mmq_y = MMQ_Y_Q4_0_RDNA2;
const int nwarps = NWARPS_Q4_0_RDNA2;
#else
const int mmq_x = MMQ_X_Q4_0_RDNA1;
const int mmq_y = MMQ_Y_Q4_0_RDNA1;
const int nwarps = NWARPS_Q4_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q4_0_AMPERE;
const int mmq_y = MMQ_Y_Q4_0_AMPERE;
const int nwarps = NWARPS_Q4_0_AMPERE;
@ -3506,6 +3543,12 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q4_1_RDNA2 64
#define MMQ_Y_Q4_1_RDNA2 128
#define NWARPS_Q4_1_RDNA2 8
#define MMQ_X_Q4_1_RDNA1 64
#define MMQ_Y_Q4_1_RDNA1 64
#define NWARPS_Q4_1_RDNA1 8
#define MMQ_X_Q4_1_AMPERE 64
#define MMQ_Y_Q4_1_AMPERE 128
#define NWARPS_Q4_1_AMPERE 4
@ -3514,14 +3557,33 @@ template <bool need_check> static __global__ void mul_mat_q4_0(
#define NWARPS_Q4_1_PASCAL 8
template <bool need_check> static __global__ void
#if __CUDA_ARCH__ < CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_TURING
__launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q4_1(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q4_1_RDNA2;
const int mmq_y = MMQ_Y_Q4_1_RDNA2;
const int nwarps = NWARPS_Q4_1_RDNA2;
#else
const int mmq_x = MMQ_X_Q4_1_RDNA1;
const int mmq_y = MMQ_Y_Q4_1_RDNA1;
const int nwarps = NWARPS_Q4_1_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q4_1_AMPERE;
const int mmq_y = MMQ_Y_Q4_1_AMPERE;
const int nwarps = NWARPS_Q4_1_AMPERE;
@ -3544,6 +3606,12 @@ template <bool need_check> static __global__ void
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q5_0_RDNA2 64
#define MMQ_Y_Q5_0_RDNA2 128
#define NWARPS_Q5_0_RDNA2 8
#define MMQ_X_Q5_0_RDNA1 64
#define MMQ_Y_Q5_0_RDNA1 64
#define NWARPS_Q5_0_RDNA1 8
#define MMQ_X_Q5_0_AMPERE 128
#define MMQ_Y_Q5_0_AMPERE 64
#define NWARPS_Q5_0_AMPERE 4
@ -3551,11 +3619,32 @@ template <bool need_check> static __global__ void
#define MMQ_Y_Q5_0_PASCAL 64
#define NWARPS_Q5_0_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q5_0(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_0(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q5_0_RDNA2;
const int mmq_y = MMQ_Y_Q5_0_RDNA2;
const int nwarps = NWARPS_Q5_0_RDNA2;
#else
const int mmq_x = MMQ_X_Q5_0_RDNA1;
const int mmq_y = MMQ_Y_Q5_0_RDNA1;
const int nwarps = NWARPS_Q5_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q5_0_AMPERE;
const int mmq_y = MMQ_Y_Q5_0_AMPERE;
const int nwarps = NWARPS_Q5_0_AMPERE;
@ -3578,6 +3667,12 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q5_1_RDNA2 64
#define MMQ_Y_Q5_1_RDNA2 128
#define NWARPS_Q5_1_RDNA2 8
#define MMQ_X_Q5_1_RDNA1 64
#define MMQ_Y_Q5_1_RDNA1 64
#define NWARPS_Q5_1_RDNA1 8
#define MMQ_X_Q5_1_AMPERE 128
#define MMQ_Y_Q5_1_AMPERE 64
#define NWARPS_Q5_1_AMPERE 4
@ -3585,11 +3680,32 @@ template <bool need_check> static __global__ void mul_mat_q5_0(
#define MMQ_Y_Q5_1_PASCAL 64
#define NWARPS_Q5_1_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q5_1(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_1(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q5_1_RDNA2;
const int mmq_y = MMQ_Y_Q5_1_RDNA2;
const int nwarps = NWARPS_Q5_1_RDNA2;
#else
const int mmq_x = MMQ_X_Q5_1_RDNA1;
const int mmq_y = MMQ_Y_Q5_1_RDNA1;
const int nwarps = NWARPS_Q5_1_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q5_1_AMPERE;
const int mmq_y = MMQ_Y_Q5_1_AMPERE;
const int nwarps = NWARPS_Q5_1_AMPERE;
@ -3612,6 +3728,12 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q8_0_RDNA2 64
#define MMQ_Y_Q8_0_RDNA2 128
#define NWARPS_Q8_0_RDNA2 8
#define MMQ_X_Q8_0_RDNA1 64
#define MMQ_Y_Q8_0_RDNA1 64
#define NWARPS_Q8_0_RDNA1 8
#define MMQ_X_Q8_0_AMPERE 128
#define MMQ_Y_Q8_0_AMPERE 64
#define NWARPS_Q8_0_AMPERE 4
@ -3619,11 +3741,32 @@ template <bool need_check> static __global__ void mul_mat_q5_1(
#define MMQ_Y_Q8_0_PASCAL 64
#define NWARPS_Q8_0_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q8_0(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q8_0(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q8_0_RDNA2;
const int mmq_y = MMQ_Y_Q8_0_RDNA2;
const int nwarps = NWARPS_Q8_0_RDNA2;
#else
const int mmq_x = MMQ_X_Q8_0_RDNA1;
const int mmq_y = MMQ_Y_Q8_0_RDNA1;
const int nwarps = NWARPS_Q8_0_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q8_0_AMPERE;
const int mmq_y = MMQ_Y_Q8_0_AMPERE;
const int nwarps = NWARPS_Q8_0_AMPERE;
@ -3646,6 +3789,12 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q2_K_RDNA2 64
#define MMQ_Y_Q2_K_RDNA2 128
#define NWARPS_Q2_K_RDNA2 8
#define MMQ_X_Q2_K_RDNA1 128
#define MMQ_Y_Q2_K_RDNA1 32
#define NWARPS_Q2_K_RDNA1 8
#define MMQ_X_Q2_K_AMPERE 64
#define MMQ_Y_Q2_K_AMPERE 128
#define NWARPS_Q2_K_AMPERE 4
@ -3653,11 +3802,32 @@ template <bool need_check> static __global__ void mul_mat_q8_0(
#define MMQ_Y_Q2_K_PASCAL 64
#define NWARPS_Q2_K_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q2_K(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q2_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q2_K_RDNA2;
const int mmq_y = MMQ_Y_Q2_K_RDNA2;
const int nwarps = NWARPS_Q2_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q2_K_RDNA1;
const int mmq_y = MMQ_Y_Q2_K_RDNA1;
const int nwarps = NWARPS_Q2_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q2_K_AMPERE;
const int mmq_y = MMQ_Y_Q2_K_AMPERE;
const int nwarps = NWARPS_Q2_K_AMPERE;
@ -3680,6 +3850,12 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q3_K_RDNA2 128
#define MMQ_Y_Q3_K_RDNA2 64
#define NWARPS_Q3_K_RDNA2 8
#define MMQ_X_Q3_K_RDNA1 32
#define MMQ_Y_Q3_K_RDNA1 128
#define NWARPS_Q3_K_RDNA1 8
#define MMQ_X_Q3_K_AMPERE 128
#define MMQ_Y_Q3_K_AMPERE 128
#define NWARPS_Q3_K_AMPERE 4
@ -3688,14 +3864,33 @@ template <bool need_check> static __global__ void mul_mat_q2_K(
#define NWARPS_Q3_K_PASCAL 8
template <bool need_check> static __global__ void
#if __CUDA_ARCH__ < CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_TURING
__launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q3_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q3_K_RDNA2;
const int mmq_y = MMQ_Y_Q3_K_RDNA2;
const int nwarps = NWARPS_Q3_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q3_K_RDNA1;
const int mmq_y = MMQ_Y_Q3_K_RDNA1;
const int nwarps = NWARPS_Q3_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q3_K_AMPERE;
const int mmq_y = MMQ_Y_Q3_K_AMPERE;
const int nwarps = NWARPS_Q3_K_AMPERE;
@ -3718,6 +3913,12 @@ template <bool need_check> static __global__ void
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q4_K_RDNA2 64
#define MMQ_Y_Q4_K_RDNA2 128
#define NWARPS_Q4_K_RDNA2 8
#define MMQ_X_Q4_K_RDNA1 32
#define MMQ_Y_Q4_K_RDNA1 64
#define NWARPS_Q4_K_RDNA1 8
#define MMQ_X_Q4_K_AMPERE 64
#define MMQ_Y_Q4_K_AMPERE 128
#define NWARPS_Q4_K_AMPERE 4
@ -3726,14 +3927,33 @@ template <bool need_check> static __global__ void
#define NWARPS_Q4_K_PASCAL 8
template <bool need_check> static __global__ void
#if __CUDA_ARCH__ < CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_TURING
__launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q4_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q4_K_RDNA2;
const int mmq_y = MMQ_Y_Q4_K_RDNA2;
const int nwarps = NWARPS_Q4_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q4_K_RDNA1;
const int mmq_y = MMQ_Y_Q4_K_RDNA1;
const int nwarps = NWARPS_Q4_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q4_K_AMPERE;
const int mmq_y = MMQ_Y_Q4_K_AMPERE;
const int nwarps = NWARPS_Q4_K_AMPERE;
@ -3756,6 +3976,12 @@ template <bool need_check> static __global__ void
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q5_K_RDNA2 64
#define MMQ_Y_Q5_K_RDNA2 128
#define NWARPS_Q5_K_RDNA2 8
#define MMQ_X_Q5_K_RDNA1 32
#define MMQ_Y_Q5_K_RDNA1 64
#define NWARPS_Q5_K_RDNA1 8
#define MMQ_X_Q5_K_AMPERE 64
#define MMQ_Y_Q5_K_AMPERE 128
#define NWARPS_Q5_K_AMPERE 4
@ -3763,11 +3989,32 @@ template <bool need_check> static __global__ void
#define MMQ_Y_Q5_K_PASCAL 64
#define NWARPS_Q5_K_PASCAL 8
template <bool need_check> static __global__ void mul_mat_q5_K(
template <bool need_check> static __global__ void
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
mul_mat_q5_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q5_K_RDNA2;
const int mmq_y = MMQ_Y_Q5_K_RDNA2;
const int nwarps = NWARPS_Q5_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q5_K_RDNA1;
const int mmq_y = MMQ_Y_Q5_K_RDNA1;
const int nwarps = NWARPS_Q5_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q5_K_AMPERE;
const int mmq_y = MMQ_Y_Q5_K_AMPERE;
const int nwarps = NWARPS_Q5_K_AMPERE;
@ -3790,6 +4037,12 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
#endif // __CUDA_ARCH__ >= CC_TURING
}
#define MMQ_X_Q6_K_RDNA2 64
#define MMQ_Y_Q6_K_RDNA2 128
#define NWARPS_Q6_K_RDNA2 8
#define MMQ_X_Q6_K_RDNA1 32
#define MMQ_Y_Q6_K_RDNA1 64
#define NWARPS_Q6_K_RDNA1 8
#define MMQ_X_Q6_K_AMPERE 64
#define MMQ_Y_Q6_K_AMPERE 64
#define NWARPS_Q6_K_AMPERE 4
@ -3798,14 +4051,33 @@ template <bool need_check> static __global__ void mul_mat_q5_K(
#define NWARPS_Q6_K_PASCAL 8
template <bool need_check> static __global__ void
#if __CUDA_ARCH__ < CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
#endif // defined(RDNA3) || defined(RDNA2)
#elif __CUDA_ARCH__ < CC_TURING
__launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
#endif // __CUDA_ARCH__ < CC_TURING
mul_mat_q6_K(
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
#if __CUDA_ARCH__ >= CC_TURING
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#if defined(RDNA3) || defined(RDNA2)
const int mmq_x = MMQ_X_Q6_K_RDNA2;
const int mmq_y = MMQ_Y_Q6_K_RDNA2;
const int nwarps = NWARPS_Q6_K_RDNA2;
#else
const int mmq_x = MMQ_X_Q6_K_RDNA1;
const int mmq_y = MMQ_Y_Q6_K_RDNA1;
const int nwarps = NWARPS_Q6_K_RDNA1;
#endif // defined(RDNA3) || defined(RDNA2)
mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
#elif __CUDA_ARCH__ >= CC_TURING
const int mmq_x = MMQ_X_Q6_K_AMPERE;
const int mmq_y = MMQ_Y_Q6_K_AMPERE;
const int nwarps = NWARPS_Q6_K_AMPERE;
@ -4588,7 +4860,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q4_0_RDNA2;
mmq_y = MMQ_Y_Q4_0_RDNA2;
nwarps = NWARPS_Q4_0_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q4_0_RDNA1;
mmq_y = MMQ_Y_Q4_0_RDNA1;
nwarps = NWARPS_Q4_0_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q4_0_AMPERE;
mmq_y = MMQ_Y_Q4_0_AMPERE;
nwarps = NWARPS_Q4_0_AMPERE;
@ -4625,7 +4905,15 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q4_1_RDNA2;
mmq_y = MMQ_Y_Q4_1_RDNA2;
nwarps = NWARPS_Q4_1_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q4_1_RDNA1;
mmq_y = MMQ_Y_Q4_1_RDNA1;
nwarps = NWARPS_Q4_1_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q4_1_AMPERE;
mmq_y = MMQ_Y_Q4_1_AMPERE;
nwarps = NWARPS_Q4_1_AMPERE;
@ -4662,7 +4950,15 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q5_0_RDNA2;
mmq_y = MMQ_Y_Q5_0_RDNA2;
nwarps = NWARPS_Q5_0_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q5_0_RDNA1;
mmq_y = MMQ_Y_Q5_0_RDNA1;
nwarps = NWARPS_Q5_0_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q5_0_AMPERE;
mmq_y = MMQ_Y_Q5_0_AMPERE;
nwarps = NWARPS_Q5_0_AMPERE;
@ -4699,7 +4995,15 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q5_1_RDNA2;
mmq_y = MMQ_Y_Q5_1_RDNA2;
nwarps = NWARPS_Q5_1_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q5_1_RDNA1;
mmq_y = MMQ_Y_Q5_1_RDNA1;
nwarps = NWARPS_Q5_1_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q5_1_AMPERE;
mmq_y = MMQ_Y_Q5_1_AMPERE;
nwarps = NWARPS_Q5_1_AMPERE;
@ -4736,7 +5040,15 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q8_0_RDNA2;
mmq_y = MMQ_Y_Q8_0_RDNA2;
nwarps = NWARPS_Q8_0_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q8_0_RDNA1;
mmq_y = MMQ_Y_Q8_0_RDNA1;
nwarps = NWARPS_Q8_0_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q8_0_AMPERE;
mmq_y = MMQ_Y_Q8_0_AMPERE;
nwarps = NWARPS_Q8_0_AMPERE;
@ -4773,7 +5085,15 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q2_K_RDNA2;
mmq_y = MMQ_Y_Q2_K_RDNA2;
nwarps = NWARPS_Q2_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q2_K_RDNA1;
mmq_y = MMQ_Y_Q2_K_RDNA1;
nwarps = NWARPS_Q2_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q2_K_AMPERE;
mmq_y = MMQ_Y_Q2_K_AMPERE;
nwarps = NWARPS_Q2_K_AMPERE;
@ -4812,7 +5132,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q3_K_RDNA2;
mmq_y = MMQ_Y_Q3_K_RDNA2;
nwarps = NWARPS_Q3_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q3_K_RDNA1;
mmq_y = MMQ_Y_Q3_K_RDNA1;
nwarps = NWARPS_Q3_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q3_K_AMPERE;
mmq_y = MMQ_Y_Q3_K_AMPERE;
nwarps = NWARPS_Q3_K_AMPERE;
@ -4850,7 +5178,15 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q4_K_RDNA2;
mmq_y = MMQ_Y_Q4_K_RDNA2;
nwarps = NWARPS_Q4_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q4_K_RDNA1;
mmq_y = MMQ_Y_Q4_K_RDNA1;
nwarps = NWARPS_Q4_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q4_K_AMPERE;
mmq_y = MMQ_Y_Q4_K_AMPERE;
nwarps = NWARPS_Q4_K_AMPERE;
@ -4887,7 +5223,15 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q5_K_RDNA2;
mmq_y = MMQ_Y_Q5_K_RDNA2;
nwarps = NWARPS_Q5_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q5_K_RDNA1;
mmq_y = MMQ_Y_Q5_K_RDNA1;
nwarps = NWARPS_Q5_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q5_K_AMPERE;
mmq_y = MMQ_Y_Q5_K_AMPERE;
nwarps = NWARPS_Q5_K_AMPERE;
@ -4924,7 +5268,15 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
const int compute_capability = g_compute_capabilities[id];
int mmq_x, mmq_y, nwarps;
if (compute_capability >= CC_TURING) {
if (compute_capability >= CC_RDNA2) {
mmq_x = MMQ_X_Q6_K_RDNA2;
mmq_y = MMQ_Y_Q6_K_RDNA2;
nwarps = NWARPS_Q6_K_RDNA2;
} else if (compute_capability >= CC_OFFSET_AMD) {
mmq_x = MMQ_X_Q6_K_RDNA1;
mmq_y = MMQ_Y_Q6_K_RDNA1;
nwarps = NWARPS_Q6_K_RDNA1;
} else if (compute_capability >= CC_TURING) {
mmq_x = MMQ_X_Q6_K_AMPERE;
mmq_y = MMQ_Y_Q6_K_AMPERE;
nwarps = NWARPS_Q6_K_AMPERE;
@ -5165,8 +5517,11 @@ void ggml_init_cublas() {
g_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
#else
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
for (int64_t id = 0; id < g_device_count; ++id) {
g_tensor_split[id] /= total_vram;
@ -5247,7 +5602,8 @@ static cudaError_t ggml_cuda_cpy_tensor_2d(
if (src->backend == GGML_BACKEND_CPU) {
kind = cudaMemcpyHostToDevice;
src_ptr = (char *) src->data;
} else if (src->backend == GGML_BACKEND_GPU) {
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
kind = cudaMemcpyDeviceToDevice;
struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
int id;
@ -5289,9 +5645,7 @@ inline void ggml_cuda_op_add(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne10 = src1->ne[0];
const int64_t ne11 = src1->ne[1];
@ -5452,14 +5806,41 @@ inline void ggml_cuda_op_mul_mat_q(
}
static int64_t get_row_rounding(ggml_type type) {
int max_compute_capability = INT_MIN;
for (int id = 0; id < g_device_count; ++id) {
if (max_compute_capability < g_compute_capabilities[id]
&& g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
int64_t min_compute_capability = INT_MAX;
int64_t max_compute_capability = INT_MIN;
for (int64_t id = 0; id < g_device_count; ++id) {
if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
if (min_compute_capability > g_compute_capabilities[id]) {
min_compute_capability = g_compute_capabilities[id];
}
if (max_compute_capability < g_compute_capabilities[id]) {
max_compute_capability = g_compute_capabilities[id];
}
}
}
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
switch(type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
case GGML_TYPE_F16:
return 1;
case GGML_TYPE_Q2_K:
return max_compute_capability >= CC_RDNA2 ? 128 : 32;
case GGML_TYPE_Q3_K:
return min_compute_capability < CC_RDNA2 ? 128 : 64;
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
return max_compute_capability >= CC_RDNA2 ? 128 : 64;
default:
GGML_ASSERT(false);
}
#else
switch(type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@ -5480,6 +5861,7 @@ static int64_t get_row_rounding(ggml_type type) {
default:
GGML_ASSERT(false);
}
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
inline void ggml_cuda_op_mul_mat_vec_q(
@ -5631,10 +6013,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
const int64_t ne0 = dst->ne[0];
const int64_t row_diff = row_high - row_low;
float * src0_ddq_as_f32;
size_t src0_as = 0;
if (src0->type != GGML_TYPE_F32) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
size_t src0_as;
float * src0_ddf_i = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as);
to_fp32_cuda(src0_dd_i, src0_ddf_i, row_diff*ne00, stream);
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
}
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
int id;
CUDA_CHECK(cudaGetDevice(&id));
@ -5651,10 +6038,11 @@ inline void ggml_cuda_op_mul_mat_cublas(
src1_ddf_i, ne10,
&beta, dst_dd_i, ldc));
ggml_cuda_pool_free(src0_ddf_i, src0_as);
if (src0_as > 0) {
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
}
(void) dst;
(void) src0_dd_i;
(void) src1_ddq_i;
(void) src1_padded_row_size;
}
@ -5793,7 +6181,6 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
const bool use_src1 = src1 != nullptr;
const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
GGML_ASSERT( src0->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
@ -5801,7 +6188,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU;

View file

@ -69,6 +69,7 @@ struct ggml_metal_context {
GGML_METAL_DECL_KERNEL(soft_max_4);
GGML_METAL_DECL_KERNEL(diag_mask_inf);
GGML_METAL_DECL_KERNEL(diag_mask_inf_8);
GGML_METAL_DECL_KERNEL(get_rows_f32);
GGML_METAL_DECL_KERNEL(get_rows_f16);
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
@ -177,7 +178,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
ctx->n_buffers = 0;
ctx->concur_list_len = 0;
ctx->d_queue = dispatch_queue_create("llama.cpp", DISPATCH_QUEUE_CONCURRENT);
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
#ifdef GGML_SWIFT
// load the default.metallib file
@ -256,6 +257,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(soft_max_4);
GGML_METAL_ADD_KERNEL(diag_mask_inf);
GGML_METAL_ADD_KERNEL(diag_mask_inf_8);
GGML_METAL_ADD_KERNEL(get_rows_f32);
GGML_METAL_ADD_KERNEL(get_rows_f16);
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
@ -325,7 +327,9 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
GGML_METAL_DEL_KERNEL(gelu);
GGML_METAL_DEL_KERNEL(soft_max);
GGML_METAL_DEL_KERNEL(soft_max_4);
GGML_METAL_DEL_KERNEL(diag_mask_inf);
GGML_METAL_DEL_KERNEL(diag_mask_inf_8);
GGML_METAL_DEL_KERNEL(get_rows_f32);
GGML_METAL_DEL_KERNEL(get_rows_f16);
GGML_METAL_DEL_KERNEL(get_rows_q4_0);
GGML_METAL_DEL_KERNEL(get_rows_q4_1);
@ -418,6 +422,7 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
for (int i = 0; i < ctx->n_buffers; ++i) {
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
//metal_printf("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
*offs = (size_t) ioffs;
@ -755,6 +760,7 @@ void ggml_metal_graph_compute(
case GGML_OP_ADD:
{
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
@ -762,6 +768,7 @@ void ggml_metal_graph_compute(
if (ggml_nelements(src1) == ne10) {
// src1 is a row
GGML_ASSERT(ne11 == 1);
[encoder setComputePipelineState:ctx->pipeline_add_row];
} else {
[encoder setComputePipelineState:ctx->pipeline_add];
@ -778,6 +785,7 @@ void ggml_metal_graph_compute(
case GGML_OP_MUL:
{
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
// utilize float4
GGML_ASSERT(ne00 % 4 == 0);
@ -785,6 +793,7 @@ void ggml_metal_graph_compute(
if (ggml_nelements(src1) == ne10) {
// src1 is a row
GGML_ASSERT(ne11 == 1);
[encoder setComputePipelineState:ctx->pipeline_mul_row];
} else {
[encoder setComputePipelineState:ctx->pipeline_mul];
@ -800,6 +809,8 @@ void ggml_metal_graph_compute(
} break;
case GGML_OP_SCALE:
{
GGML_ASSERT(ggml_is_contiguous(src0));
const float scale = *(const float *) src1->data;
[encoder setComputePipelineState:ctx->pipeline_scale];
@ -899,8 +910,8 @@ void ggml_metal_graph_compute(
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
if (ggml_is_contiguous(src0) &&
ggml_is_contiguous(src1) &&
if (!ggml_is_transposed(src0) &&
!ggml_is_transposed(src1) &&
src1t == GGML_TYPE_F32 &&
[ctx->device supportsFamily:MTLGPUFamilyApple7] &&
ne00%32 == 0 &&
@ -925,9 +936,12 @@ void ggml_metal_graph_compute(
[encoder setBytes:&nb01 length:sizeof(nb01) atIndex:5];
[encoder setBytes:&nb02 length:sizeof(nb02) atIndex:6];
[encoder setBytes:&ne12 length:sizeof(ne12) atIndex:7];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:8];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:9];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:10];
[encoder setBytes:&nb10 length:sizeof(nb10) atIndex:8];
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:9];
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:10];
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:11];
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12];
[encoder setBytes:&gqa length:sizeof(gqa) atIndex:13];
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
} else {
@ -1077,6 +1091,7 @@ void ggml_metal_graph_compute(
case GGML_OP_GET_ROWS:
{
switch (src0->type) {
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_get_rows_f32]; break;
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
@ -1092,9 +1107,9 @@ void ggml_metal_graph_compute(
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&(src0->ne[0]) length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&(src0->nb[1]) length:sizeof(uint64_t) atIndex:4];
[encoder setBytes:&(dst->nb[1]) length:sizeof(uint64_t) atIndex:5];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:5];
const int64_t n = ggml_nelements(src1);

View file

@ -1321,7 +1321,6 @@ kernel void kernel_mul_mat_q3_K_f32(
dst[r1*ne0 + r2*ne0*ne1 + first_row + row] = sumf1[row];
}
}
}
#else
kernel void kernel_mul_mat_q3_K_f32(
@ -1865,6 +1864,15 @@ kernel void kernel_mul_mat_q6_K_f32(
//============================= templates and their specializations =============================
// NOTE: this is not dequantizing - we are simply fitting the template
template <typename type4x4>
void dequantize_f32(device const float4x4 * src, short il, thread type4x4 & reg) {
float4x4 temp = *(((device float4x4 *)src));
for (int i = 0; i < 16; i++){
reg[i/4][i%4] = temp[i/4][i%4];
}
}
template <typename type4x4>
void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg) {
half4x4 temp = *(((device half4x4 *)src));
@ -1875,7 +1883,6 @@ void dequantize_f16(device const half4x4 * src, short il, thread type4x4 & reg)
template <typename type4x4>
void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 1);
const float d1 = il ? (xb->d / 16.h) : xb->d;
const float d2 = d1 / 256.f;
@ -1887,12 +1894,10 @@ void dequantize_q4_0(device const block_q4_0 *xb, short il, thread type4x4 & reg
reg[i/2][2*(i%2)+0] = d1 * (qs[i] & mask0) + md;
reg[i/2][2*(i%2)+1] = d2 * (qs[i] & mask1) + md;
}
}
template <typename type4x4>
void dequantize_q4_1(device const block_q4_1 *xb, short il, thread type4x4 & reg) {
device const uint16_t * qs = ((device const uint16_t *)xb + 2);
const float d1 = il ? (xb->d / 16.h) : xb->d;
const float d2 = d1 / 256.f;
@ -1964,7 +1969,6 @@ void dequantize_q3_K(device const block_q3_K *xb, short il, thread type4x4 & reg
for (int i = 0; i < 16; ++i) {
reg[i/4][i%4] = dl * (q[i] & mask) - (h[i] & m ? 0 : ml);
}
#else
float kcoef = il&1 ? 1.f/16.f : 1.f;
uint16_t kmask = il&1 ? 0xF0 : 0x0F;
@ -2110,13 +2114,16 @@ kernel void kernel_get_rows(
// each block_q contains 16*nl weights
template<typename block_q, short nl, void (*dequantize_func)(device const block_q *, short, thread half4x4 &)>
kernel void kernel_mul_mm(device const uchar * src0,
device const float * src1,
device const uchar * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne02,
constant int64_t & nb01,
constant int64_t & nb02,
constant int64_t & ne12,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & gqa,
@ -2125,7 +2132,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
uint tiitg[[thread_index_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]]) {
threadgroup half * sa = ((threadgroup half *)shared_memory);
threadgroup half * sa = (threadgroup half *)(shared_memory);
threadgroup float * sb = (threadgroup float *)(shared_memory + 4096);
const uint r0 = tgpig.y;
@ -2146,10 +2153,15 @@ kernel void kernel_mul_mm(device const uchar * src0,
}
short il = (tiitg % THREAD_PER_ROW);
uint offset0 = im/gqa*nb02; ushort offset1 = il/nl;
uint offset0 = im/gqa*nb02;
ushort offset1 = il/nl;
device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1;
device const float * y = src1 + (r1 * BLOCK_SIZE_N + thread_col) * ne00 \
+ BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL) + im * ne00 * ne1;
device const float * y = (device const float *)(src1
+ nb12 * im
+ nb11 * (r1 * BLOCK_SIZE_N + thread_col)
+ nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL)));
for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) {
//load data and store to threadgroup memory
@ -2229,6 +2241,7 @@ kernel void kernel_mul_mm(device const uchar * src0,
typedef void (get_rows_t)(device const void *, device const int *, device float *, constant int64_t &, \
constant uint64_t &, constant uint64_t &, uint, uint, uint);
template [[host_name("kernel_get_rows_f32")]] kernel get_rows_t kernel_get_rows<float4x4, 1, dequantize_f32>;
template [[host_name("kernel_get_rows_f16")]] kernel get_rows_t kernel_get_rows<half4x4, 1, dequantize_f16>;
template [[host_name("kernel_get_rows_q4_0")]] kernel get_rows_t kernel_get_rows<block_q4_0, 2, dequantize_q4_0>;
template [[host_name("kernel_get_rows_q4_1")]] kernel get_rows_t kernel_get_rows<block_q4_1, 2, dequantize_q4_1>;
@ -2239,9 +2252,22 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows
template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows<block_q5_K, QK_NL, dequantize_q5_K>;
template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows<block_q6_K, QK_NL, dequantize_q6_K>;
typedef void (mat_mm_t)(device const uchar *, device const float *, device float *, constant int64_t &,\
constant int64_t &, constant int64_t &, constant int64_t &, constant int64_t &, \
constant int64_t &, constant int64_t &, constant uint &, threadgroup uchar *, uint3, uint, uint);
typedef void (mat_mm_t)(
device const uchar * src0,
device const uchar * src1,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne02,
constant int64_t & nb01,
constant int64_t & nb02,
constant int64_t & ne12,
constant int64_t & nb10,
constant int64_t & nb11,
constant int64_t & nb12,
constant int64_t & ne0,
constant int64_t & ne1,
constant uint & gqa,
threadgroup uchar *, uint3, uint, uint);
template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm<half4x4, 1, dequantize_f16>;
template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm<block_q4_0, 2, dequantize_q4_0>;

18
ggml.c
View file

@ -283,7 +283,7 @@ typedef double ggml_float;
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#ifdef __ARM_NEON
#if defined(__ARM_NEON) && !defined(_MSC_VER)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
@ -4303,10 +4303,21 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) {
}
size_t ggml_nbytes(const struct ggml_tensor * tensor) {
size_t nbytes = tensor->ne[0]*tensor->nb[0]/ggml_blck_size(tensor->type);
size_t nbytes;
size_t blck_size = ggml_blck_size(tensor->type);
if (blck_size == 1) {
nbytes = ggml_type_size(tensor->type);
for (int i = 0; i < GGML_MAX_DIMS; ++i) {
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
}
}
else {
nbytes = tensor->ne[0]*tensor->nb[0]/blck_size;
for (int i = 1; i < GGML_MAX_DIMS; ++i) {
nbytes += (tensor->ne[i] - 1)*tensor->nb[i];
}
}
return nbytes;
}
@ -18340,7 +18351,8 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) {
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 "] %8s\n",
i,
node->ne[0], node->ne[1],
ggml_op_name(node->op));
ggml_op_name(node->op),
ggml_get_name(node));
}
for (int i = 0; i < GGML_OP_COUNT; i++) {

2
ggml.h
View file

@ -270,7 +270,7 @@ extern "C" {
#if defined(__ARM_NEON) && defined(__CUDACC__)
typedef half ggml_fp16_t;
#elif defined(__ARM_NEON)
#elif defined(__ARM_NEON) && !defined(_MSC_VER)
typedef __fp16 ggml_fp16_t;
#else
typedef uint16_t ggml_fp16_t;

View file

@ -79,6 +79,7 @@ KEY_TOKENIZER_RWKV = "tokenizer.rwkv.world"
class MODEL_ARCH(IntEnum):
LLAMA : int = auto()
FALCON : int = auto()
BAICHUAN:int = auto()
GPT2 : int = auto()
GPTJ : int = auto()
GPTNEOX: int = auto()
@ -108,6 +109,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
MODEL_ARCH.LLAMA: "llama",
MODEL_ARCH.FALCON: "falcon",
MODEL_ARCH.BAICHUAN:"baichuan",
MODEL_ARCH.GPT2: "gpt2",
MODEL_ARCH.GPTJ: "gptj",
MODEL_ARCH.GPTNEOX: "gptneox",
@ -153,6 +155,22 @@ MODEL_TENSOR_NAMES: dict[MODEL_ARCH, dict[MODEL_TENSOR, str]] = {
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.BAICHUAN: {
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
MODEL_TENSOR.OUTPUT: "output",
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
},
MODEL_ARCH.GPT2: {
# TODO
},
@ -165,6 +183,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
MODEL_ARCH.BAICHUAN: [
MODEL_TENSOR.ROPE_FREQS,
MODEL_TENSOR.ATTN_ROT_EMBD,
],
}
@ -187,7 +209,7 @@ class TensorNameMap:
# Output
MODEL_TENSOR.OUTPUT: (
"embed_out", # gptneox
"lm_head", # gpt2 mpt falcon llama-hf
"lm_head", # gpt2 mpt falcon llama-hf baichuan
"output", # llama-pth
),
@ -195,7 +217,7 @@ class TensorNameMap:
MODEL_TENSOR.OUTPUT_NORM: (
"gpt_neox.final_layer_norm", # gptneox
"transformer.ln_f", # gpt2 falcon
"model.norm", # llama-hf
"model.norm", # llama-hf baichuan
"norm", # llama-pth
),
@ -311,6 +333,7 @@ class TensorNameMap:
tensor_name = tensor_names.get(tensor)
if tensor_name is None:
continue
mapping[tensor_name] = (tensor, tensor_name)
for key in keys:
mapping[key] = (tensor, tensor_name)
for bid in range(n_blocks):
@ -319,11 +342,12 @@ class TensorNameMap:
if tensor_name is None:
continue
tensor_name = tensor_name.format(bid = bid)
mapping[tensor_name] = (tensor, tensor_name)
for key in keys:
key = key.format(bid = bid)
mapping[key] = (tensor, tensor_name)
def get_type_and_name(self, key: str, try_suffixes: Sequence[str]) -> tuple[MODEL_TENSOR, str] | None:
def get_type_and_name(self, key: str, try_suffixes: Sequence[str] = ()) -> tuple[MODEL_TENSOR, str] | None:
result = self.mapping.get(key)
if result is not None:
return result
@ -334,13 +358,13 @@ class TensorNameMap:
return (result[0], result[1] + suffix)
return None
def get_name(self, key: str, try_suffixes: Sequence[str]) -> str | None:
def get_name(self, key: str, try_suffixes: Sequence[str] = ()) -> str | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None
return result[1]
def get_type(self, key: str, try_suffixes: Sequence[str]) -> MODEL_TENSOR | None:
def get_type(self, key: str, try_suffixes: Sequence[str] = ()) -> MODEL_TENSOR | None:
result = self.get_type_and_name(key, try_suffixes = try_suffixes)
if result is None:
return None

View file

@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.3.2"
version = "0.3.3"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [

View file

@ -2609,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
memcpy(utmp, x[i].scales, 12);
const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)};
uint32x2_t mins8 = { 0 };
mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[0] &= kmask1;

736
llama.cpp
View file

@ -158,6 +158,7 @@ static std::string format(const char * fmt, ...) {
enum llm_arch {
LLM_ARCH_LLAMA,
LLM_ARCH_FALCON,
LLM_ARCH_BAICHUAN,
LLM_ARCH_GPT2,
LLM_ARCH_GPTJ,
LLM_ARCH_GPTNEOX,
@ -172,6 +173,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
{ LLM_ARCH_GPTJ, "gptj" },
{ LLM_ARCH_GPTNEOX, "gptneox" },
{ LLM_ARCH_MPT, "mpt" },
{ LLM_ARCH_BAICHUAN,"baichuan" },
};
enum llm_kv {
@ -312,6 +314,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_BAICHUAN,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_FALCON,
{
@ -1686,6 +1707,15 @@ static void llm_load_hparams(
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
case LLM_ARCH_BAICHUAN:
{
GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
switch (hparams.n_layer) {
case 32: model.type = e_model::MODEL_7B; break;
case 40: model.type = e_model::MODEL_13B; break;
default: model.type = e_model::MODEL_UNKNOWN;
}
} break;
default: (void)0;
};
@ -1926,7 +1956,6 @@ static void llm_load_tensors(
const int64_t n_vocab = hparams.n_vocab;
const auto tn = LLM_TN(model.arch);
switch (model.arch) {
case LLM_ARCH_LLAMA:
{
@ -1969,6 +1998,72 @@ static void llm_load_tensors(
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
auto & layer = model.layers[i];
layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
if (backend == GGML_BACKEND_GPU) {
vram_weights +=
ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
}
}
} break;
case LLM_ARCH_BAICHUAN:
{
model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
{
ggml_backend backend_norm;
ggml_backend backend_output;
if (n_gpu_layers > int(n_layer)) {
// norm is not performance relevant on its own but keeping it in VRAM reduces data copying
// on Windows however this is detrimental unless everything is on the GPU
#ifndef _WIN32
backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#else
backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
#endif // _WIN32
backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
} else {
backend_norm = GGML_BACKEND_CPU;
backend_output = GGML_BACKEND_CPU;
}
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
if (backend_norm == GGML_BACKEND_GPU) {
vram_weights += ggml_nbytes(model.output_norm);
}
if (backend_output == GGML_BACKEND_GPU_SPLIT) {
vram_weights += ggml_nbytes(model.output);
}
}
const uint32_t n_ff = hparams.n_ff;
const int i_gpu_start = n_layer - n_gpu_layers;
model.layers.resize(n_layer);
for (uint32_t i = 0; i < n_layer; ++i) {
const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@ -2545,6 +2640,367 @@ static struct ggml_cgraph * llm_build_llama(
return gf;
}
static struct ggml_cgraph * llm_build_baichaun(
llama_context & lctx,
const llama_token * tokens,
const float * embd,
int n_tokens,
int n_past) {
GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
const int N = n_tokens;
const auto & model = lctx.model;
const auto & hparams = model.hparams;
const auto & kv_self = lctx.kv_self;
GGML_ASSERT(!!kv_self.ctx);
const int64_t n_embd = hparams.n_embd;
const int64_t n_layer = hparams.n_layer;
const int64_t n_ctx = hparams.n_ctx;
const int64_t n_head = hparams.n_head;
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head = hparams.n_embd_head();
const int64_t n_embd_gqa = hparams.n_embd_gqa();
GGML_ASSERT(n_embd_head == hparams.n_rot);
const float freq_base = hparams.rope_freq_base;
const float freq_scale = hparams.rope_freq_scale;
const float norm_rms_eps = hparams.f_norm_rms_eps;
const int n_gpu_layers = model.n_gpu_layers;
auto & buf_compute = lctx.buf_compute;
struct ggml_init_params params = {
/*.mem_size =*/ buf_compute.size,
/*.mem_buffer =*/ buf_compute.data,
/*.no_alloc =*/ false,
};
params.no_alloc = true;
struct ggml_context * ctx0 = ggml_init(params);
ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * cur;
struct ggml_tensor * inpL;
if (tokens) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_allocr_alloc(lctx.alloc, inp_tokens);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
}
ggml_set_name(inp_tokens, "inp_tokens");
inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
} else {
#ifdef GGML_USE_MPI
GGML_ASSERT(false && "not implemented");
#endif
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
ggml_allocr_alloc(lctx.alloc, inpL);
if (!ggml_allocr_is_measure(lctx.alloc)) {
memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
}
}
const int i_gpu_start = n_layer - n_gpu_layers;
(void) i_gpu_start;
// offload functions set the tensor output backend to GPU
// tensors are GPU-accelerated if any input or the output has been offloaded
//
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
// in that case ggml_cuda_assign_buffers has no effect
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
offload_func_t offload_func_kq = llama_nop;
offload_func_t offload_func_v = llama_nop;
#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > n_layer) {
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
}
if (n_gpu_layers > n_layer + 1) {
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
}
if (n_gpu_layers > n_layer + 2) {
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
}
#endif // GGML_USE_CUBLAS
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(lctx.alloc, KQ_scale);
if (!ggml_allocr_is_measure(lctx.alloc)) {
ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
}
ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
for (int il = 0; il < n_layer; ++il) {
ggml_format_name(inpL, "layer_inp_%d", il);
offload_func_t offload_func = llama_nop;
#ifdef GGML_USE_CUBLAS
if (il >= i_gpu_start) {
offload_func = ggml_cuda_assign_buffers_no_alloc;
}
#endif // GGML_USE_CUBLAS
struct ggml_tensor * inpSA = inpL;
// norm
{
cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
offload_func(cur);
ggml_set_name(cur, "rms_norm_0");
// cur = cur*attn_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
offload_func(cur);
ggml_set_name(cur, "attention_norm_0");
}
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
offload_func_kq(tmpk);
ggml_set_name(tmpk, "tmpk");
struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");
struct ggml_tensor * Kcur;
struct ggml_tensor * Qcur;
switch (model.type) {
case MODEL_7B:
Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
break;
case MODEL_13B:
Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
break;
default:
GGML_ASSERT(false);
}
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur");
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");
// store key and value to memory
{
// compute the transposed [N, n_embd] V matrix
struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
offload_func_v(tmpv);
ggml_set_name(tmpv, "tmpv");
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
offload_func_v(Vcur);
ggml_set_name(Vcur, "Vcur");
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
offload_func_kq(k);
ggml_set_name(k, "k");
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
offload_func_v(v);
ggml_set_name(v, "v");
// important: storing RoPE-ed version of K in the KV cache!
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
offload_func_kq(Q);
ggml_set_name(Q, "Q");
struct ggml_tensor * K =
ggml_view_3d(ctx0, kv_self.k,
n_embd_head, n_past + N, n_head_kv,
ggml_element_size(kv_self.k)*n_embd_gqa,
ggml_element_size(kv_self.k)*n_embd_head,
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
offload_func_kq(K);
ggml_set_name(K, "K");
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
offload_func_kq(KQ);
ggml_set_name(KQ, "KQ");
// KQ_scaled = KQ / sqrt(n_embd_head)
// KQ_scaled shape [n_past + N, N, n_head, 1]
struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
offload_func_kq(KQ_scaled);
ggml_set_name(KQ_scaled, "KQ_scaled");
struct ggml_tensor * KQ_masked;
struct ggml_tensor * KQ_scaled_alibi;
switch (model.type) {
case MODEL_7B:
KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
break;
case MODEL_13B:
KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
break;
default:
GGML_ASSERT(false);
}
// KQ_masked = mask_past(KQ_scaled)
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
// struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
// offload_func_kq(KQ_masked);
// ggml_set_name(KQ_masked, "KQ_masked");
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
offload_func_v(KQ_soft_max);
ggml_set_name(KQ_soft_max, "KQ_soft_max");
// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
n_past + N, n_embd_head, n_head_kv,
ggml_element_size(kv_self.v)*n_ctx,
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
offload_func_v(V);
ggml_set_name(V, "V");
#if 1
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
offload_func_v(KQV);
ggml_set_name(KQV, "KQV");
#else
// make V contiguous in memory to speed up the matmul, however we waste time on the copy
// on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
// is there a better way?
struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
#endif
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
offload_func_v(KQV_merged);
ggml_set_name(KQV_merged, "KQV_merged");
// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
offload_func_v(cur);
ggml_set_name(cur, "KQV_merged_contiguous");
// projection (no bias)
cur = ggml_mul_mat(ctx0,
model.layers[il].wo,
cur);
offload_func(cur);
ggml_set_name(cur, "result_wo");
}
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
offload_func(inpFF);
ggml_set_name(inpFF, "inpFF");
// feed-forward network
{
// norm
{
cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
offload_func(cur);
ggml_set_name(cur, "rms_norm_1");
// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
offload_func(cur);
ggml_set_name(cur, "ffn_norm");
}
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
model.layers[il].w3,
cur);
offload_func(tmp);
ggml_set_name(tmp, "result_w3");
cur = ggml_mul_mat(ctx0,
model.layers[il].w1,
cur);
offload_func(cur);
ggml_set_name(cur, "result_w1");
// SILU activation
cur = ggml_silu(ctx0, cur);
offload_func(cur);
ggml_set_name(cur, "silu");
cur = ggml_mul(ctx0, cur, tmp);
offload_func(cur);
ggml_set_name(cur, "silu_x_result_w3");
cur = ggml_mul_mat(ctx0,
model.layers[il].w2,
cur);
offload_func(cur);
ggml_set_name(cur, "result_w2");
}
cur = ggml_add(ctx0, cur, inpFF);
offload_func(cur);
ggml_set_name(cur, "inpFF_+_result_w2");
// input for next layer
inpL = cur;
}
cur = inpL;
// norm
{
cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
offload_func_nr(cur);
ggml_set_name(cur, "rms_norm_2");
// cur = cur*norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.output_norm);
// offload_func_nr(cur); // TODO CPU + GPU mirrored backend
ggml_set_name(cur, "result_norm");
}
// lm_head
cur = ggml_mul_mat(ctx0, model.output, cur);
ggml_set_name(cur, "result_output");
ggml_build_forward_expand(gf, cur);
ggml_free(ctx0);
return gf;
}
static struct ggml_cgraph * llm_build_falcon(
llama_context & lctx,
const llama_token * tokens,
@ -2867,6 +3323,10 @@ static struct ggml_cgraph * llama_build_graph(
{
result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
} break;
case LLM_ARCH_BAICHUAN:
{
result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
} break;
case LLM_ARCH_FALCON:
{
result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@ -2972,10 +3432,6 @@ static bool llama_eval_internal(
if (lctx.ctx_metal) {
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
ggml_metal_graph_compute(lctx.ctx_metal, gf);
ggml_metal_get_tensor (lctx.ctx_metal, res);
if (!lctx.embedding.empty()) {
ggml_metal_get_tensor(lctx.ctx_metal, embeddings);
}
} else {
ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
}
@ -3124,10 +3580,9 @@ struct llm_tokenizer_spm {
while (offs < text.size()) {
llm_symbol sym;
size_t len = utf8_len(text[offs]);
GGML_ASSERT(offs + len <= text.size());
sym.text = text.c_str() + offs;
sym.n = len;
offs += len;
sym.n = std::min(len, text.size() - offs);
offs += sym.n;
sym.prev = index - 1;
sym.next = offs == text.size() ? -1 : index + 1;
index++;
@ -4643,7 +5098,16 @@ void llama_beam_search(llama_context * ctx,
// quantization
//
static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
template <typename T>
struct no_init {
T value;
no_init() { /* do nothing */ }
};
static void llama_convert_tensor_internal(
struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
const size_t nelements, const int nthread
) {
if (output.size() < nelements) {
output.resize(nelements);
}
@ -4678,7 +5142,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
auto blocks_per_thread = nblocks / nthread;
auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
std::vector<std::thread> workers;
for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@ -4691,14 +5154,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
qtype.to_float(inbuf, outbuf, nels);
}
};
workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
in_buff_offs += thr_block_bytes;
out_buff_offs += thr_elems;
}
for (auto & worker : workers) {
worker.join();
for (auto & w : workers) { w.join(); }
workers.clear();
}
#ifdef GGML_USE_K_QUANTS
static ggml_type get_k_quant_type(
ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
) {
const std::string name = ggml_get_name(tensor);
// TODO: avoid hardcoded tensor names - use the TN_* constants
const auto tn = LLM_TN(model.arch);
auto use_more_bits = [](int i_layer, int num_layers) -> bool {
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
};
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
int nx = tensor->ne[0];
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (new_type != GGML_TYPE_Q8_0) {
new_type = GGML_TYPE_Q6_K;
}
} else if (name.find("attn_v.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
(*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
if (model.type == MODEL_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
}
++*i_attention_wv;
} else if (name.find("ffn_down.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
: model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (model.arch == LLM_ARCH_FALCON) {
new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
}
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
new_type = GGML_TYPE_Q5_K;
}
++*i_feed_forward_w2;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (model.arch != LLM_ARCH_FALCON) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
}
}
else if (name.find("attn_qkv.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
}
// This can be used to reduce the size of the Q5_K_S model.
// The associated PPL increase is fully in line with the size reduction
//else {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
bool convert_incompatible_tensor = false;
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
int nx = tensor->ne[0];
int ny = tensor->ne[1];
if (nx % QK_K != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
convert_incompatible_tensor = true;
}
}
if (convert_incompatible_tensor) {
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
} else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
} else {
throw std::runtime_error("Unsupported tensor size encountered\n");
}
}
return new_type;
}
#endif
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type quantized_type;
@ -4783,18 +5355,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
std::vector<int64_t> hist_all(1 << 4, 0);
std::vector<std::thread> workers;
workers.reserve(nthread);
std::mutex mutex;
#ifdef GGML_USE_K_QUANTS
auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
};
#endif
int idx = 0;
std::vector<uint8_t> read_data;
std::vector<uint8_t> work;
std::vector<no_init<uint8_t>> read_data;
std::vector<no_init<uint8_t>> work;
std::vector<no_init<float>> f32_conv_buf;
// populate the original tensors so we get an initial meta data
for (int i = 0; i < ml->n_tensors; ++i) {
@ -4816,7 +5384,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const std::string name = ggml_get_name(tensor);
if (read_data.size() < ggml_nbytes(tensor)) {
read_data.resize(ggml_nbytes(tensor));
}
tensor->data = read_data.data();
ml->load_data_for(tensor);
@ -4841,101 +5411,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (quantize) {
new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
// TODO: avoid hardcoded tensor names - use the TN_* constants
const auto tn = LLM_TN(ml->get_arch());
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
int nx = tensor->ne[0];
if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
new_type = GGML_TYPE_Q8_0;
}
else if (new_type != GGML_TYPE_Q8_0) {
new_type = GGML_TYPE_Q6_K;
}
} else if (name.find("attn_v.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
(i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
if (model.type == MODEL_70B) {
// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
// nearly negligible increase in model size by quantizing this tensor with more bits:
if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
}
++i_attention_wv;
} else if (name.find("ffn_down.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
: model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
: GGML_TYPE_Q3_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
if (model.arch == LLM_ARCH_FALCON) {
new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
} else {
if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
}
}
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
new_type = GGML_TYPE_Q5_K;
}
++i_feed_forward_w2;
} else if (name.find("attn_output.weight") != std::string::npos) {
if (model.arch != LLM_ARCH_FALCON) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
} else {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
}
}
else if (name.find("attn_qkv.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
}
else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
}
// This can be used to reduce the size of the Q5_K_S model.
// The associated PPL increase is fully in line with the size reduction
//else {
// if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
//}
bool convert_incompatible_tensor = false;
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
int nx = tensor->ne[0];
int ny = tensor->ne[1];
if (nx % QK_K != 0) {
LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
convert_incompatible_tensor = true;
}
}
if (convert_incompatible_tensor) {
if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
} else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
} else {
throw std::runtime_error("Unsupported tensor size encountered\n");
}
}
new_type = get_k_quant_type(
new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
);
#endif
// If we've decided to quantize to the same type the tensor is already
// in then there's nothing to do.
@ -4950,23 +5428,24 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
const size_t nelements = ggml_nelements(tensor);
float * f32_data;
std::vector<float> f32_conv_buf;
if (tensor->type == GGML_TYPE_F32) {
f32_data = (float *) tensor->data;
} else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
} else {
llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
f32_data = (float *) f32_conv_buf.data();
}
LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
fflush(stdout);
if (work.size() < nelements * 4) {
work.resize(nelements * 4); // upper bound on size
}
new_data = work.data();
std::vector<int64_t> hist_cur(1 << 4, 0);
std::array<int64_t, 1 << 4> hist_cur = {};
static const int chunk_size = 32 * 512;
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@ -4977,13 +5456,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
size_t counter = 0;
new_size = 0;
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
std::vector<int64_t> local_hist;
std::array<int64_t, 1 << 4> local_hist = {};
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
size_t first = counter; counter += chunk_size;
if (first >= nelements) {
if (!local_hist.empty()) {
if (local_size > 0) {
for (int j=0; j<int(local_hist.size()); ++j) {
hist_cur[j] += local_hist[j];
}
@ -4993,22 +5472,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
}
lock.unlock();
size_t last = std::min(nelements, first + chunk_size);
if (local_hist.empty()) {
local_hist.resize(hist_cur.size(), 0);
}
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
}
};
if ((int) workers.size() < nthread_use - 1) {
workers.resize(nthread_use - 1);
}
for (int it = 0; it < nthread_use - 1; ++it) {
workers[it] = std::thread(compute);
workers.emplace_back(compute);
}
compute();
for (int it = 0; it < nthread_use - 1; ++it) {
workers[it].join();
}
for (auto & w : workers) { w.join(); }
workers.clear();
}
LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@ -6222,7 +6694,7 @@ int llama_tokenize_with_model(
auto res = llama_tokenize_internal(model->vocab, text, add_bos);
if (n_max_tokens < (int) res.size()) {
LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
// LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
return -((int) res.size());
}

View file

@ -0,0 +1,4 @@
以下内容为人类用户与与一位智能助手的对话。
用户:你好!
助手:

View file

@ -13,7 +13,7 @@ CLI_ARGS_MAIN_PERPLEXITY = [
"hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct",
"interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base",
"low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock",
"model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
"model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q",
"np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt",
"prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n",
"repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",

View file

@ -0,0 +1,69 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(LLAMA_BLAS @LLAMA_BLAS@)
set(LLAMA_CUBLAS @LLAMA_CUBLAS@)
set(LLAMA_METAL @LLAMA_METAL@)
set(LLAMA_MPI @LLAMA_MPI@)
set(LLAMA_CLBLAST @LLAMA_CLBLAST@)
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
@PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transient dependencies satisfied
find_package(Threads REQUIRED)
if (APPLE AND LLAMA_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
endif()
if (LLAMA_BLAS)
find_package(BLAS REQUIRED)
endif()
if (LLAMA_CUBLAS)
find_package(CUDAToolkit REQUIRED)
endif()
if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (LLAMA_MPI)
find_package(MPI REQUIRED)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast REQUIRED)
endif()
if (LLAMA_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
endif()
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${LLAMA_LIB_DIR})
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )
check_required_components(Llama)

View file

@ -29,9 +29,8 @@ llama_build_executable(test-tokenizer-0-llama.cpp)
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-0-falcon.cpp)
#llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_build_executable(test-tokenizer-1.cpp)
# test-tokenizer-1 requires a BPE vocab. re-enable when we have one.
#llama_test_executable (test-tokenizer-1.llama test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_build_executable(test-tokenizer-1-llama.cpp)
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)

View file

@ -1,5 +1,6 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cstdio>
#include <string>
@ -89,6 +90,12 @@ int main(int argc, char **argv) {
return 2;
}
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
bool success = true;
for (const auto & test_kv : k_tests()) {

View file

@ -0,0 +1,127 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>
typedef int codepoint;
std::string codepoint_to_utf8(codepoint cp) {
std::string result;
if (0x00 <= cp && cp <= 0x7f) {
result.push_back(cp);
} else if (0x80 <= cp && cp <= 0x7ff) {
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
result.push_back(0x80 | (cp & 0x3f));
} else if (0x800 <= cp && cp <= 0xffff) {
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
} else if (0x10000 <= cp && cp <= 0x10ffff) {
result.push_back(0xf0 | ((cp >> 18) & 0x07));
result.push_back(0x80 | ((cp >> 12) & 0x3f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
} else {
throw std::invalid_argument("invalid codepoint");
}
return result;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto lparams = llama_context_default_params();
lparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
ctx = llama_new_context_with_model(model, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM);
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
const int n_vocab = llama_n_vocab(ctx);
for (int i = 0; i < n_vocab; ++i) {
std::string str = llama_detokenize_spm(ctx, std::vector<int>(1, i));
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (check != str) {
fprintf(stderr, "%s : error: token %d detokenizes to >%s<(%llu) but tokenization of this detokenizes to >%s<(%llu)\n",
__func__, i, str.c_str(), str.length(), check.c_str(), check.length());
if(i != 3)
return 2;
}
}
for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
if (cp < 0xd800 || cp > 0xdfff) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
if(cp != 0 && cp != 9601)
return 3;
}
}
}
for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);
if (str != check) {
fprintf(stderr, "%s : error: codepoint %d detokenizes to >%s<(%llu) instead of >%s<(%llu)\n",
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
return 4;
}
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return 0;
}

View file

@ -1,108 +0,0 @@
#include "llama.h"
#include "common.h"
#include <cassert>
#include <cstdio>
#include <cstring>
#include <string>
#include <codecvt>
#include <map>
#include <vector>
#include <locale>
static std::string escape_whitespace(const std::string& text) {
std::string result = "\xe2\x96\x81";
for (size_t offs = 0; offs < text.length(); ++offs) {
if (text[offs] == ' ') {
result += "\xe2\x96\x81";
} else {
result += text[offs];
}
}
return result;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init(false);
// load the vocab
{
auto lparams = llama_context_default_params();
lparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
ctx = llama_new_context_with_model(model, lparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
GGML_ASSERT(llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_BPE);
const int n_vocab = llama_n_vocab(ctx);
for (int i = 0; i < n_vocab; ++i) {
std::string forward = llama_token_to_piece(ctx, i);
std::vector<llama_token> tokens = llama_tokenize(ctx, forward, false);
if (tokens.size() == 1) {
if (i != tokens[0]) {
std::string backward = llama_token_to_piece(ctx, tokens[0]);
fprintf(stderr, "%s : error: token %d is string %s but bpe returns token %d %s\n",
__func__, i, llama_token_to_piece(ctx, i).c_str(), tokens[0], backward.c_str());
return 2;
}
}
}
#ifdef _WIN32
std::wstring_convert<typename std::codecvt_utf8<char16_t>, char16_t> u16converter;
for (char16_t ch = 0x0000; ch < 0xffff; ++ch) {
std::u16string u16str(1, ch);
std::string str = u16converter.to_bytes(u16str);
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
if (tokens.size() == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n",
__func__, str.c_str(), tokens[0]);
}
}
std::wstring_convert<typename std::codecvt_utf8<char32_t>, char32_t> u32converter;
for (char32_t ch = 0x0000; ch < 0x0010ffff; ++ch) {
std::u32string u32str(1, ch);
std::string str = u32converter.to_bytes(u32str);
std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
if (tokens.size() == 1) {
fprintf(stderr, "%s : info: %s tokenized to %d \n", __func__, str.c_str(), tokens[0]);
}
}
#endif
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
return 0;
}