Merge branch 'master' into gptq
commit 793fc301c9
23 changed files with 1306 additions and 508 deletions
@@ -6,7 +6,7 @@ RUN apt-get update && \
 apt-get install -y build-essential python3 python3-pip

 RUN pip install --upgrade pip setuptools wheel \
-&& pip install torch torchvision torchaudio sentencepiece numpy
+&& pip install numpy requests sentencepiece torch tqdm

 WORKDIR /app

@@ -14,4 +14,4 @@ COPY . .

 RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
.github/workflows/build.yml (vendored, 3 changed lines)

@@ -54,6 +54,7 @@ jobs:
 cd build
 cmake ..
 cmake --build . --config Release
+ctest --output-on-failure

 macOS-latest-make:
 runs-on: macos-latest
@@ -90,6 +91,7 @@ jobs:
 cd build
 cmake ..
 cmake --build . --config Release
+ctest --output-on-failure

 windows-latest-cmake:
 runs-on: windows-latest
@@ -106,6 +108,7 @@ jobs:
 cd build
 cmake ..
 cmake --build . --config Release
+ctest --output-on-failure

 - name: Get commit hash
 id: commit
.github/workflows/docker.yml (vendored, 2 changed lines)

@@ -40,7 +40,7 @@ jobs:
 uses: docker/login-action@v2
 with:
 registry: ghcr.io
-username: ${{ github.actor }}
+username: ${{ github.repository_owner }}
 password: ${{ secrets.GITHUB_TOKEN }}

 - name: Build and push Docker image (versioned)
CMakeLists.txt (287 changed lines)

@@ -1,131 +1,252 @@
-cmake_minimum_required(VERSION 3.8)
+cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
-project("llama.cpp")
+project("llama.cpp" C CXX)

-set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-set(CMAKE_CXX_STANDARD_REQUIRED true)
-set(CMAKE_C_STANDARD 11)
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-find_package(Threads REQUIRED)

 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
 set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
 set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()

-option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)

-option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
-option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
+set(LLAMA_STANDALONE ON)
-option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

-if (APPLE)
+# configure project version
-option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF)
+# TODO
-option(LLAMA_NO_AVX "llama: disable AVX" OFF)
+else()
-option(LLAMA_NO_AVX2 "llama: disable AVX2" OFF)
+set(LLAMA_STANDALONE OFF)
-option(LLAMA_NO_FMA "llama: disable FMA" OFF)
 endif()

+if (EMSCRIPTEN)
+set(BUILD_SHARED_LIBS_DEFAULT OFF)

+option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+else()
+if (MINGW)
+set(BUILD_SHARED_LIBS_DEFAULT OFF)
+else()
+set(BUILD_SHARED_LIBS_DEFAULT ON)
+endif()
+endif()

+#
+# Option list
+#

+# general
+option(LLAMA_STATIC "llama: static link libraries" OFF)
+option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
+option(LLAMA_LTO "llama: enable link time optimization" OFF)

+# debug
+option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
+option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
+option(LLAMA_GPROF "llama: enable gprof" OFF)

+# sanitizers
+option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
+option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
+option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)

+# instruction set specific
+option(LLAMA_AVX "llama: enable AVX" ON)
+option(LLAMA_AVX2 "llama: enable AVX2" ON)
+option(LLAMA_FMA "llama: enable FMA" ON)

+# 3rd party libs
+option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
+option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)

+option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})

+#
+# Compile flags
+#

+set(CMAKE_CXX_STANDARD_REQUIRED true)
+set(CMAKE_C_STANDARD_REQUIRED true)
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)

 if (NOT MSVC)
 if (LLAMA_SANITIZE_THREAD)
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
+add_compile_options(-fsanitize=thread)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
 endif()

 if (LLAMA_SANITIZE_ADDRESS)
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
 endif()

 if (LLAMA_SANITIZE_UNDEFINED)
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
+add_compile_options(-fsanitize=undefined)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
 endif()
 endif()

-if (APPLE AND NOT LLAMA_NO_ACCELERATE)
+if (APPLE AND LLAMA_ACCELERATE)
 find_library(ACCELERATE_FRAMEWORK Accelerate)
 if (ACCELERATE_FRAMEWORK)
 message(STATUS "Accelerate framework found")

-set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
+add_compile_definitions(GGML_USE_ACCELERATE)
-set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
 else()
 message(WARNING "Accelerate framework not found")
 endif()
 endif()
+if (LLAMA_OPENBLAS)
+if (LLAMA_STATIC)
+set(BLA_STATIC ON)
+endif()

+set(BLA_VENDOR OpenBLAS)
+find_package(BLAS)
+if (BLAS_FOUND)
+message(STATUS "OpenBLAS found")

+add_compile_definitions(GGML_USE_OPENBLAS)
+add_link_options(${BLAS_LIBRARIES})
+else()
+message(WARNING "OpenBLAS not found")
+endif()
+endif()

 if (LLAMA_ALL_WARNINGS)
 if (NOT MSVC)
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
+set(c_flags
--Wall \
+-Wall
--Wextra \
+-Wextra
--Wpedantic \
+-Wpedantic
--Wshadow \
+-Wshadow
--Wcast-qual \
+-Wcast-qual
--Wstrict-prototypes \
+-Wstrict-prototypes
--Wpointer-arith \
+-Wpointer-arith
--Wno-unused-function \
+-Wno-unused-function
-")
+)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
+set(cxx_flags
--Wall \
+-Wall
--Wextra \
+-Wextra
--Wpedantic \
+-Wpedantic
--Wcast-qual \
+-Wcast-qual
-")
+)
 else()
 # todo : msvc
 endif()

+add_compile_options(
+"$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+"$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+)

 endif()

-message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (LLAMA_LTO)
+include(CheckIPOSupported)
-if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+check_ipo_supported(RESULT result OUTPUT output)
-message(STATUS "ARM detected")
+if (result)
-else()
+set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
-message(STATUS "x86 detected")
-if (MSVC)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
-set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
 else()
-if(NOT LLAMA_NO_AVX)
+message(WARNING "IPO is not supported: ${output}")
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
-endif()
-if(NOT LLAMA_NO_AVX2)
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
-endif()
-if(NOT LLAMA_NO_FMA)
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
-endif()
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
 endif()
 endif()

-# if (LLAMA_PERF)
+# Architecture specific
-#     set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
+# TODO: probably these flags need to be tweaked on some architectures
-# endif()
+# feel free to update the Makefile for your architecture and send a pull request or issue
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+if (NOT MSVC)
+if (LLAMA_STATIC)
+add_link_options(-static)
+if (MINGW)
+add_link_options(-static-libgcc -static-libstdc++)
+endif()
+endif()
+if (LLAMA_GPROF)
+add_compile_options(-pg)
+endif()
+if (LLAMA_NATIVE)
+add_compile_options(-march=native)
+endif()
+endif()

-add_executable(llama
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
-main.cpp
+message(STATUS "ARM detected")
-utils.cpp
+if (MSVC)
-utils.h)
+# TODO: arm msvc?
+else()
+if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
+add_compile_options(-mcpu=native)
+endif()
+# TODO: armv6,7,8 version specific flags
+endif()
+elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
+message(STATUS "x86 detected")
+if (MSVC)
+if (LLAMA_AVX2)
+add_compile_options(/arch:AVX2)
+elseif (LLAMA_AVX)
+add_compile_options(/arch:AVX)
+endif()
+else()
+add_compile_options(-mf16c)
+if (LLAMA_FMA)
+add_compile_options(-mfma)
+endif()
+if (LLAMA_AVX)
+add_compile_options(-mavx)
+endif()
+if (LLAMA_AVX2)
+add_compile_options(-mavx2)
+endif()
+endif()
+else()
+# TODO: support PowerPC
+message(STATUS "Unknown architecture")
+endif()

-add_executable(quantize
-quantize.cpp
-utils.cpp
-utils.h)

-add_library(ggml
+#
-ggml.c
+# Build library
-ggml.h)
+#

-target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
+add_executable(llama main.cpp)
-target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
+add_executable(quantize quantize.cpp)

+add_library(utils OBJECT
+utils.cpp
+utils.h)

+target_include_directories(utils PUBLIC .)
+target_compile_features(utils PUBLIC cxx_std_11) # don't bump

+add_library(ggml OBJECT
+ggml.c
+ggml.h)

-target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
 target_include_directories(ggml PUBLIC .)
-target_link_libraries(quantize PRIVATE ggml)
+target_compile_features(ggml PUBLIC c_std_11) # don't bump
-target_link_libraries(llama PRIVATE ggml)
-target_link_libraries(ggml PRIVATE Threads::Threads)
+#
+# Linking
+#

+target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
+target_link_libraries(llama PRIVATE ggml utils)
+target_link_libraries(quantize PRIVATE ggml utils)

+#
+# programs, examples and tests
+#

+if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+enable_testing()
+add_subdirectory(tests)
+endif ()

+#if (LLAMA_BUILD_EXAMPLES)
+# add_subdirectory(examples)
+#endif()
Makefile (63 changed lines)

@@ -17,7 +17,7 @@ CXXV := $(shell $(CXX) --version | head -n 1)
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
 ifeq ($(UNAME_S),Darwin)
 ifneq ($(UNAME_P),arm)
-SYSCTL_M := $(shell sysctl -n hw.optional.arm64)
+SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
 ifeq ($(SYSCTL_M),1)
 # UNAME_P := arm
 # UNAME_M := arm64

@@ -30,6 +30,7 @@ endif
 # Compile flags
 #

+# keep standard at C11 and C++11
 CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
 CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS =

@@ -52,6 +53,10 @@ ifeq ($(UNAME_S),NetBSD)
 CFLAGS += -pthread
 CXXFLAGS += -pthread
 endif
+ifeq ($(UNAME_S),OpenBSD)
+CFLAGS += -pthread
+CXXFLAGS += -pthread
+endif
 ifeq ($(UNAME_S),Haiku)
 CFLAGS += -pthread
 CXXFLAGS += -pthread

@@ -95,30 +100,59 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 ifneq (,$(findstring sse3,$(SSE3_M)))
 CFLAGS += -msse3
 endif
+AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
+ifneq (,$(findstring avx512f,$(AVX512F_M)))
+CFLAGS += -mavx512f
+endif
+AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
+ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
+CFLAGS += -mavx512bw
+endif
+AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
+ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
+CFLAGS += -mavx512dq
+endif
+AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
+ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
+CFLAGS += -mavx512vl
+endif
+AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
+ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
+CFLAGS += -mavx512cd
+endif
+AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
+ifneq (,$(findstring avx512er,$(AVX512ER_M)))
+CFLAGS += -mavx512er
+endif
+AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
+ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
+CFLAGS += -mavx512ifma
+endif
+AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
+ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
+CFLAGS += -mavx512pf
+endif
 else ifeq ($(UNAME_S),Haiku)
-AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
+AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
-ifneq (,$(findstring avx,$(AVX1_M)))
+ifneq (,$(findstring AVX,$(AVX1_M)))
 CFLAGS += -mavx
 endif
-AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
+AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
-ifneq (,$(findstring avx2,$(AVX2_M)))
+ifneq (,$(findstring AVX2,$(AVX2_M)))
 CFLAGS += -mavx2
 endif
-FMA_M := $(shell sysinfo -cpu | grep "FMA ")
+FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
-ifneq (,$(findstring fma,$(FMA_M)))
+ifneq (,$(findstring FMA,$(FMA_M)))
 CFLAGS += -mfma
 endif
-F16C_M := $(shell sysinfo -cpu | grep "F16C ")
+F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
-ifneq (,$(findstring f16c,$(F16C_M)))
+ifneq (,$(findstring F16C,$(F16C_M)))
 CFLAGS += -mf16c
 endif
 else
 CFLAGS += -mfma -mf16c -mavx -mavx2
 endif
 endif
-ifeq ($(UNAME_M),amd64)
-CFLAGS += -mavx -mavx2 -mfma -mf16c
-endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 ifneq (,$(findstring POWER9,$(POWER9_M)))

@@ -130,7 +164,8 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 endif
 endif
 ifndef LLAMA_NO_ACCELERATE
-# Mac M1 - include Accelerate framework
+# Mac M1 - include Accelerate framework.
+# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
 ifeq ($(UNAME_S),Darwin)
 CFLAGS += -DGGML_USE_ACCELERATE
 LDFLAGS += -framework Accelerate

@@ -193,7 +228,7 @@ clean:

 main: main.cpp ggml.o utils.o
 $(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
-./main -h
+@echo "\x1b[36mrun ./main -h for help\x1b[0m"

 quantize: quantize.cpp ggml.o utils.o
 $(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)
README.md (73 changed lines)

@@ -7,10 +7,13 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

 **Hot topics:**

-- RMSNorm implementation / fixes: https://github.com/ggerganov/llama.cpp/issues/173
+- [Added Alpaca support](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
 - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105

+**TEMPORARY NOTICE:**
+If you're updating to the latest master, you will need to regenerate your model files as the format has changed.

 ## Description

 The main goal is to run the model using 4-bit quantization on a MacBook

@@ -147,15 +150,27 @@ python3 -m pip install torch numpy sentencepiece
 python3 convert-pth-to-ggml.py models/7B/ 1

 # quantize the model to 4-bits
-./quantize.sh 7B
+python3 quantize.py 7B

 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```

+Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.

 When running the larger models, make sure you have enough disk space to store all the intermediate files.

-TODO: add model disk/mem requirements
+### Memory/Disk Requirements

+As the models are currently fully loaded into memory, you will need adequate disk space to save them
+and sufficient RAM to load them. At the moment, memory and disk requirements are the same.

+| model | original size | quantized size (4-bit) |
+|-------|---------------|------------------------|
+| 7B | 13 GB | 3.9 GB |
+| 13B | 24 GB | 7.8 GB |
+| 30B | 60 GB | 19.5 GB |
+| 65B | 120 GB | 38.5 GB |

 ### Interactive mode

@@ -163,22 +178,56 @@ If you want a more ChatGPT-like experience, you can run in interactive mode by p
 In this mode, you can always interrupt generation by pressing Ctrl+C and enter one or more lines of text which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt which makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.

 Here is an example few-shot interaction, invoked with the command
-```
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
--p \
-"Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

-User: Hello, Bob.
+```bash
-Bob: Hello. How may I help you today?
+# default arguments using 7B model
-User: Please tell me the largest city in Europe.
+./chat.sh
-Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
-User:"

+# custom arguments using 13B model
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```

 Note the use of `--color` to distinguish between user input and generated text.

 

+### Instruction mode with Alpaca

+First, download the `ggml` Alpaca model into the `./models` folder:

+```
+# use one of these
+# TODO: add a script to simplify the download
+curl -o ./models/ggml-alpaca-7b-q4.bin -C - https://gateway.estuary.tech/gw/ipfs/QmUp1UGeQFDqJKvtjbSYPBiZZKRjLp8shVP9hT8ZB9Ynv1
+curl -o ./models/ggml-alpaca-7b-q4.bin -C - https://ipfs.io/ipfs/QmUp1UGeQFDqJKvtjbSYPBiZZKRjLp8shVP9hT8ZB9Ynv1
+curl -o ./models/ggml-alpaca-7b-q4.bin -C - https://cloudflare-ipfs.com/ipfs/QmUp1UGeQFDqJKvtjbSYPBiZZKRjLp8shVP9hT8ZB9Ynv1
+```

+Now run the `main` tool like this:

+```
+./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins
+```

+Sample run:

+```
+== Running in interactive mode. ==
+ - Press Ctrl+C to interject at any time.
+ - Press Return to return control to LLaMa.
+ - If you want to submit another line, end your input in '\'.

+ Below is an instruction that describes a task. Write a response that appropriately completes the request.

+> How many letters are there in the English alphabet?
+There 26 letters in the English Alphabet
+> What is the most common way of transportation in Amsterdam?
+The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
+> List 5 words that start with "ca".
+cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
+>
+```

 ### Android

 You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
alpaca.sh (new executable file, 6 lines)

@@ -0,0 +1,6 @@
+#!/bin/bash
+#
+# Temporary script - will be removed in the future
+#
+
+./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7
chat.sh (new executable file, 6 lines)

@@ -0,0 +1,6 @@
+#!/bin/bash
+#
+# Temporary script - will be removed in the future
+#
+
+./main -m ./models/7B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
@@ -10,150 +10,104 @@
 # - Name (char[name_length])
 # - Data (float[n_dims])
 #
-# By default, the bigger matrices are converted to 16-bit floats.
-# This can be disabled by adding the "use-f32" CLI argument.
-#
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #

+import argparse
 import os
 import sys
 import json
 import struct
 import numpy as np
 import torch

 from sentencepiece import SentencePieceProcessor

-if len(sys.argv) < 3:
+def parse_args():
-print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
-print(" ftype == 0 -> float32")
-print(" ftype == 1 -> float16")
-sys.exit(1)

-# output in the same directory as the model
+parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
-dir_model = sys.argv[1]
+parser.add_argument('dir_model', help='directory containing the model checkpoint')
+parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
-fname_hparams = sys.argv[1] + "/params.json"
+parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
-fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
+return parser.parse_args()

 def get_n_parts(dim):
-if dim == 4096:
-return 1
+mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
-elif dim == 5120:
+n_parts = mappings.get(dim)
-return 2
+if n_parts is None:
-elif dim == 6656:
+print(f"Invalid dim: {dim}")
-return 4
-elif dim == 8192:
-return 8
-else:
-print("Invalid dim: " + str(dim))
 sys.exit(1)

-# possible data types
+print(f"n_parts = {n_parts}\n")
-# ftype == 0 -> float32
+return n_parts
-# ftype == 1 -> float16
-#
-# map from ftype to string
-ftype_str = ["f32", "f16"]

-ftype = 1
+def load_hparams_and_tokenizer(dir_model):
-if len(sys.argv) > 2:
-ftype = int(sys.argv[2])
-if ftype < 0 or ftype > 1:
-print("Invalid ftype: " + str(ftype))
-sys.exit(1)
-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"

-if os.path.exists(fname_out):
+# `dir_model` is something like `models/7B` or `models/7B/`.
-print(f"Skip conversion, it already exists: {fname_out}")
+# "tokenizer.model" is expected under model's parent dir.
-sys.exit(0)
+# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
+# Let's use the model's parent dir directly.
+model_parent_dir = os.path.dirname(os.path.normpath(dir_model))

-with open(fname_hparams, "r") as f:
+fname_hparams = f"{dir_model}/params.json"
-hparams = json.load(f)
+fname_tokenizer = f"{model_parent_dir}/tokenizer.model"

-tokenizer = SentencePieceProcessor(fname_tokenizer)
+with open(fname_hparams, "r") as f:
+hparams = json.load(f)
+print(hparams)

-hparams.update({"vocab_size": tokenizer.vocab_size()})
+tokenizer = SentencePieceProcessor(fname_tokenizer)
+hparams.update({"vocab_size": tokenizer.vocab_size()})

-n_parts = get_n_parts(hparams["dim"])
+return hparams, tokenizer

-print(hparams)
+def write_header(fout, hparams, ftype):
-print('n_parts = ', n_parts)

-for p in range(n_parts):
+keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
-print('Processing part ', p)
+values = [
+0x67676d66, # magic: ggmf in hex
+1, # file version
+*[hparams[key] for key in keys],
+hparams["dim"] // hparams["n_heads"], # rot (obsolete)
+ftype
+]
+fout.write(struct.pack("i" * len(values), *values))

-#fname_model = sys.argv[1] + "/consolidated.00.pth"
+def write_tokens(fout, tokenizer):
-fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
-if (p > 0):
-fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)

-model = torch.load(fname_model, map_location="cpu")

-fout = open(fname_out, "wb")

-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["vocab_size"]))
-fout.write(struct.pack("i", hparams["dim"]))
-fout.write(struct.pack("i", hparams["multiple_of"]))
-fout.write(struct.pack("i", hparams["n_heads"]))
-fout.write(struct.pack("i", hparams["n_layers"]))
-fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
-fout.write(struct.pack("i", ftype))

-# Is this correct??
 for i in range(tokenizer.vocab_size()):
 if tokenizer.is_unknown(i):
-# "<unk>" token (translated as ??)
 text = " \u2047 ".encode("utf-8")
-fout.write(struct.pack("i", len(text)))
-fout.write(text)
 elif tokenizer.is_control(i):
-# "<s>"/"</s>" tokens
+text = b""
-fout.write(struct.pack("i", 0))
 elif tokenizer.is_byte(i):
-# "<U+XX>" tokens (which may be invalid UTF-8)
 piece = tokenizer.id_to_piece(i)
 if len(piece) != 6:
-print("Invalid token: " + piece)
+print(f"Invalid token: {piece}")
 sys.exit(1)
 byte_value = int(piece[3:-1], 16)
-fout.write(struct.pack("i", 1))
+text = struct.pack("B", byte_value)
-fout.write(struct.pack("B", byte_value))
 else:
-# normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
 text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
 fout.write(struct.pack("i", len(text)))
 fout.write(text)
+fout.write(struct.pack("f", tokenizer.get_score(i)))

-for k, v in model.items():
+def process_and_write_variables(fout, model, ftype):
-name = k
-shape = v.shape

-# skip layers.X.attention.inner_attention.rope.freqs
+for name, datao in model.items():
-if name[-5:] == "freqs":
+if name.endswith("freqs"):
 continue

-print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
+shape = datao.shape

-#data = tf.train.load_variable(dir_model, name).squeeze()
+print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
-data = v.numpy().squeeze()
-n_dims = len(data.shape);

-# for efficiency - transpose some matrices
+data = datao.numpy().squeeze()
-# "model/h.*/attn/c_attn/w"
+n_dims = len(shape)
-# "model/h.*/attn/c_proj/w"
-# "model/h.*/mlp/c_fc/w"
-# "model/h.*/mlp/c_proj/w"
-#if name[-14:] == "/attn/c_attn/w" or \
-# name[-14:] == "/attn/c_proj/w" or \
-# name[-11:] == "/mlp/c_fc/w" or \
-# name[-13:] == "/mlp/c_proj/w":
-# print(" Transposing")
-# data = data.transpose()

-dshape = data.shape

 # default type is fp16
 ftype_cur = 1

@@ -164,18 +118,64 @@ for p in range(n_parts):

 # header
 sname = name.encode('utf-8')
-fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
-for i in range(n_dims):
+for dim in reversed(data.shape):
-fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
+fout.write(struct.pack("i", dim))
-fout.write(sname);
+fout.write(sname)

-# data
+# data output to file
 data.tofile(fout)

-# I hope this deallocates the memory ..
+def main():
-model = None

-fout.close()
+args = parse_args()
+dir_model = args.dir_model
+ftype = args.ftype
+ftype_str = ["f32", "f16"]

-print("Done. Output file: " + fname_out + ", (part ", p, ")")
+hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
-print("")
+print(args)

+# if only writing vocab to file
+if args.vocab_only:

+fname_model = f"{dir_model}/consolidated.00.pth"
+fname_out = f"{dir_model}/ggml-vocab.bin"

+print(f"Extracting only the vocab from '{fname_model}'\n")

+model = torch.load(fname_model, map_location="cpu")

+with open(fname_out, "wb") as fout:
+fout.write(struct.pack("i", hparams["vocab_size"]))
+write_tokens(fout, tokenizer)

+del model

+print(f"Done. Output file: {fname_out}\n")

+return

+n_parts = get_n_parts(hparams["dim"])

+for p in range(n_parts):

+print(f"Processing part {p}\n")

+fname_model = f"{dir_model}/consolidated.0{p}.pth"
+fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"

+model = torch.load(fname_model, map_location="cpu")

+with open(fname_out, "wb") as fout:
+write_header(fout, hparams, ftype)
+write_tokens(fout, tokenizer)
+process_and_write_variables(fout, model, ftype)

+del model

+print(f"Done. Output file: {fname_out}, (part {p})\n")

+if __name__ == "__main__":
+main()
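As context for the format change above: write_header() packs nine 32-bit integers (native byte order) with struct.pack("i", ...) in this order: the 'ggmf' magic, the file version, the five hyperparameters listed in keys, the obsolete rot field, and ftype. The C fragment below is a minimal sketch of reading that header back, mirroring the magic and version checks that llama_model_load() performs later in this commit; the function and variable names here are illustrative only and not part of the repository.

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch: read the 9 little/native-endian int32 fields written by write_header(). */
int read_ggmf_header(FILE *f) {
    int32_t v[9]; /* magic, version, vocab_size, dim, multiple_of,
                     n_heads, n_layers, rot (obsolete), ftype */
    if (fread(v, sizeof(int32_t), 9, f) != 9) {
        return -1;
    }
    if (v[0] == 0x67676d6c) {              /* old unversioned 'ggml' magic */
        fprintf(stderr, "old format, regenerate the model file\n");
        return -1;
    }
    if (v[0] != 0x67676d66 || v[1] != 1) { /* 'ggmf' magic, file version 1 */
        fprintf(stderr, "bad magic or unsupported version\n");
        return -1;
    }
    printf("n_vocab=%d dim=%d multiple_of=%d n_heads=%d n_layers=%d ftype=%d\n",
           v[2], v[3], v[4], v[5], v[6], v[8]);
    return 0;
}
```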
examples/chatLLaMa (new executable file, 53 lines)

@@ -0,0 +1,53 @@
+#!/bin/bash
+
+cd "$(dirname "$0")/.." || exit
+
+MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
+USER_NAME="${USER_NAME:-User}"
+AI_NAME="${AI_NAME:-ChatLLaMa}"
+
+# Adjust to the number of CPU cores you want to use.
+N_THREAD="${N_THREAD:-8}"
+# Number of tokens to predict (made it larger than default because we want a long interaction)
+N_PREDICTS="${N_PREDICTS:-2048}"
+
+# Note: you can also override the generation options by specifying them on the command line:
+# For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
+GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --repeat_penalty 1.17647}"
+
+# shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
+./main $GEN_OPTIONS \
+--model "$MODEL" \
+--threads "$N_THREAD" \
+--n_predict "$N_PREDICTS" \
+--color --interactive \
+--reverse-prompt "${USER_NAME}:" \
+--prompt "
+Text transcript of a never ending dialog, where ${USER_NAME} interacts with an AI assistant named ${AI_NAME}.
+${AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer ${USER_NAME}’s requests immediately and with details and precision.
+There are no annotations like (30 seconds passed...) or (to himself), just what ${USER_NAME} and ${AI_NAME} say alound to each other.
+The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+
+$USER_NAME: Hello, $AI_NAME!
+$AI_NAME: Hello $USER_NAME! How may I help you today?
+$USER_NAME: What time is it?
+$AI_NAME: It is $(date +%H:%M).
+$USER_NAME: What year is it?
+$AI_NAME: We are in $(date +%Y).
+$USER_NAME: Please tell me the largest city in Europe.
+$AI_NAME: The largest city in Europe is Moscow, the capital of Russia.
+$USER_NAME: What can you tell me about Moscow?
+$AI_NAME: Moscow, on the Moskva River in western Russia, is the nation’s cosmopolitan capital. In its historic core is the Kremlin, a complex that’s home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center.
+$USER_NAME: What is a cat?
+$AI_NAME: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
+$USER_NAME: How do I pass command line arguments to a Node.js program?
+$AI_NAME: The arguments are stored in process.argv.
+
+argv[0] is the path to the Node. js executable.
+argv[1] is the path to the script file.
+argv[2] is the first argument passed to the script.
+argv[3] is the second argument passed to the script and so on.
+$USER_NAME: Name a color.
+$AI_NAME: Blue
+$USER_NAME:" "$@"
@@ -34,6 +34,7 @@
 cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
 chmod +x $out/bin/convert-pth-to-ggml
 '';
+meta.mainProgram = "llama";
 };
 devShells.default = pkgs.mkShell {
 packages = with pkgs; [
ggml.c (86 changed lines)

@@ -2,7 +2,7 @@

 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
 #include <alloca.h>
 #endif

@@ -361,7 +361,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);

 // AVX routines provided by GH user Const-me
 // ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600
-#if __AVX2__
+#if __AVX2__ || __AVX512F__
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
 static inline __m256i bytesFromNibbles( const uint8_t* rsi )

@@ -397,7 +397,6 @@ static inline __m128i packNibbles( __m256i bytes )
 }
 #endif
-

 // method 5
 // blocks of QK elements
 // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)

@@ -1262,6 +1261,47 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
 *s = sumf;
 }

+#if __AVX512F__ && QK == 32
+static inline __m512 dot_q4_0_oneblock_avx512(
+__m512 acc,
+const uint8_t * pd0,
+const uint8_t * pd1,
+const uint8_t * pb0,
+const uint8_t * pb1,
+size_t bs,
+int i
+) {
+const float * d0_0 = (const float *) (pd0 + i*bs);
+const float * d1_0 = (const float *) (pd1 + i*bs);
+
+const uint8_t * restrict p0 = pb0 + (i+0)*bs;
+const uint8_t * restrict p1 = pb1 + (i+0)*bs;
+
+// Compute combined scale for the block
+float scaleScalar = d0_0[0] * d1_0[0];
+__m512 scale = _mm512_set1_ps( scaleScalar );
+
+__m256i bx = bytesFromNibbles( p0 );
+__m256i by = bytesFromNibbles( p1 );
+
+// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
+const __m256i off = _mm256_set1_epi8( 8 );
+bx = _mm256_sub_epi8( bx, off );
+by = _mm256_sub_epi8( by, off );
+
+// Sign-extend 16 signed bytes into int16_t
+__m512i x32 = _mm512_cvtepi8_epi16( bx );
+__m512i y32 = _mm512_cvtepi8_epi16( by );
+// Compute products of int16_t integers, add pairwise
+__m512i i64 = _mm512_madd_epi16( x32, y32 );
+
+// Convert int32_t to float
+__m512 p = _mm512_cvtepi32_ps( i64 );
+// Apply the scale, and accumulate
+return _mm512_fmadd_ps( scale, p, acc );
+}
+#endif

 inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
 ggml_float sumf = 0.0;

@@ -1417,6 +1457,40 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 #else
 #error "not implemented for QK"
 #endif
+#elif defined(__AVX512F__)
+
+#if QK == 32
+// Initialize accumulator with zeros
+__m512 acc0 = _mm512_setzero_ps();
+__m512 acc1 = _mm512_setzero_ps();
+
+const int superblock_size = 8;
+const int superblock_count = nb / superblock_size;
+const int remainder = nb % superblock_size;
+
+for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
+int i = superblock_ix * superblock_size;
+
+acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+0 );
+acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+1 );
+acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+2 );
+acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+3 );
+acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+4 );
+acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+5 );
+acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+6 );
+acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+7 );
+}
+
+// Remainders
+for (int i = superblock_count * superblock_size; i < nb; ++i) {
+acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i );
+}
+
+// Horizontal sum of all lanes of the accumulator
+sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 );
+#else
+#error "not implemented for QK"
+#endif
 #elif defined(__AVX2__)
 #if QK == 32
 const size_t countBlocks = nb;

@@ -1928,7 +2002,7 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res
 const size_t bs = 2*sizeof(float) + QK/2;

 const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
 const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
 const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));

 for (int i = 0; i < nb; i++) {

@@ -5556,7 +5630,7 @@ static void ggml_compute_forward_rms_norm_f32(
 const size_t nb2 = dst->nb[2];
 const size_t nb3 = dst->nb[3];

-const ggml_float eps = 1e-5f; // TODO: make this a parameter
+const ggml_float eps = 1e-6f; // TODO: make this a parameter

 // TODO: optimize
 for (int i03 = 0; i03 < ne03; i03++) {

@@ -5572,7 +5646,7 @@ static void ggml_compute_forward_rms_norm_f32(
 mean /= ne00;

 float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);

 memcpy(y, x, ne00 * sizeof(float));
 // for (int i00 = 0; i00 < ne00; i00++) {
 // y[i00] = x[i00];
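For reference, the AVX-512 path added above computes, per q4_0 block, the combined scale d0*d1 times the integer dot product of the two blocks' 4-bit quants after shifting them from [0..15] into [-8..7]. Below is a scalar sketch of that per-block computation; it assumes QK == 32 and the same delta/nibble layout and pd/pb/bs pointer convention as dot_q4_0_oneblock_avx512() in the diff, and the helper name itself is illustrative, not part of ggml.

```c
#include <stdint.h>
#include <stddef.h>

#define QK 32

/* Scalar reference for the dot product of one pair of q4_0 blocks. */
static float dot_q4_0_block_ref(const uint8_t *pd0, const uint8_t *pd1,
                                const uint8_t *pb0, const uint8_t *pb1,
                                size_t bs, int i) {
    const float d0 = *(const float *) (pd0 + i*bs);  /* block scales (deltas) */
    const float d1 = *(const float *) (pd1 + i*bs);

    const uint8_t *p0 = pb0 + i*bs;  /* QK/2 packed bytes, two quants per byte */
    const uint8_t *p1 = pb1 + i*bs;

    int sum = 0;
    for (int j = 0; j < QK/2; j++) {
        /* unpack both 4-bit fields and offset them into [-8 .. 7] */
        const int x0 = (p0[j] & 0x0F) - 8, x1 = (p0[j] >> 4) - 8;
        const int y0 = (p1[j] & 0x0F) - 8, y1 = (p1[j] >> 4) - 8;
        sum += x0*y0 + x1*y1;
    }
    /* combined scale of the two blocks, as in the AVX-512 version */
    return d0*d1*(float)sum;
}
```

In the vectorized version, eight such blocks are processed per loop iteration across two independent accumulators, with a scalar-style remainder loop, and the lanes are summed at the end with _mm512_reduce_add_ps().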
367
main.cpp
367
main.cpp
|
@ -3,10 +3,12 @@
|
||||||
#include "utils.h"
|
#include "utils.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <cinttypes>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
@ -18,6 +20,13 @@
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined (_WIN32)
|
||||||
|
#pragma comment(lib,"kernel32.lib")
|
||||||
|
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
|
||||||
|
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
|
||||||
|
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
|
||||||
|
#endif
|
||||||
|
|
||||||
#define ANSI_COLOR_RED "\x1b[31m"
|
#define ANSI_COLOR_RED "\x1b[31m"
|
||||||
#define ANSI_COLOR_GREEN "\x1b[32m"
|
#define ANSI_COLOR_GREEN "\x1b[32m"
|
||||||
#define ANSI_COLOR_YELLOW "\x1b[33m"
|
#define ANSI_COLOR_YELLOW "\x1b[33m"
|
||||||
|
@ -27,6 +36,8 @@
|
||||||
#define ANSI_COLOR_RESET "\x1b[0m"
|
#define ANSI_COLOR_RESET "\x1b[0m"
|
||||||
#define ANSI_BOLD "\x1b[1m"
|
#define ANSI_BOLD "\x1b[1m"
|
||||||
|
|
||||||
|
static const int EOS_TOKEN_ID = 2;
|
||||||
|
|
||||||
// determine number of model parts based on the dimension
|
// determine number of model parts based on the dimension
|
||||||
static const std::map<int, int> LLAMA_N_PARTS = {
|
static const std::map<int, int> LLAMA_N_PARTS = {
|
||||||
{ 4096, 1 },
|
{ 4096, 1 },
|
||||||
|
@ -86,7 +97,8 @@ struct llama_model {
|
||||||
};
|
};
|
||||||
|
|
||||||
// load the model's weights from a file
|
// load the model's weights from a file
|
||||||
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
|
|
||||||
|
bool llama_model_load(const std::string & fname, llama_model & model, llama_vocab & vocab, int n_ctx, int n_parts, ggml_type memory_type = GGML_TYPE_F32) {
|
||||||
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||||
|
|
||||||
std::vector<char> f_buf(1024*1024);
|
std::vector<char> f_buf(1024*1024);
|
||||||
|
@ -102,14 +114,27 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||||
{
|
{
|
||||||
uint32_t magic;
|
uint32_t magic;
|
||||||
fin.read((char *) &magic, sizeof(magic));
|
fin.read((char *) &magic, sizeof(magic));
|
||||||
if (magic != 0x67676d6c) {
|
if (magic == FILE_MAGIC_UNVERSIONED) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
|
||||||
|
__func__, fname.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (magic != FILE_MAGIC) {
|
||||||
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint32_t format_version;
|
||||||
|
fin.read((char *) &format_version, sizeof(format_version));
|
||||||
|
|
||||||
|
if (format_version != FILE_VERSION) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
|
||||||
|
__func__, fname.c_str(), format_version, FILE_VERSION);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int n_ff = 0;
|
int n_ff = 0;
|
||||||
int n_parts = 0;
|
|
||||||
|
|
||||||
// load hparams
|
// load hparams
|
||||||
{
|
{
|
||||||
|
@ -127,7 +152,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
|
||||||
hparams.n_ctx = n_ctx;
|
hparams.n_ctx = n_ctx;
|
||||||
|
|
||||||
n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
|
||||||
n_parts = hparams.f16 == 4 ? 1 : LLAMA_N_PARTS.at(hparams.n_embd);
|
|
||||||
|
if (n_parts < 1) {
|
||||||
|
n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
|
||||||
|
}
|
||||||
|
|
||||||
|
// temp warning to tell the user to use "--n_parts"
|
||||||
|
if (hparams.f16 == 4 && n_parts != 1) {
|
||||||
|
fprintf(stderr, "%s: GPTQ model detected - are you sure n_parts should be %d? we normally expect it to be 1\n", __func__, n_parts);
|
||||||
|
fprintf(stderr, "%s: use '--n_parts 1' if necessary\n", __func__);
|
||||||
|
}
|
||||||
|
|
||||||
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||||
fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@@ -144,19 +178,27 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     // load vocab
     {
         std::string word;
+        std::vector<char> tmp(64);

         for (int i = 0; i < model.hparams.n_vocab; i++) {
             uint32_t len;
             fin.read((char *) &len, sizeof(len));

             word.resize(len);
-            fin.read((char *) word.data(), len);
+            if (len > 0) {
+                tmp.resize(len);
+                fin.read(tmp.data(), len);
+                word.assign(tmp.data(), len);
+            } else {
+                word.clear();
+            }
+
+            float score;
+            fin.read((char *) &score, sizeof(score));

             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
+            vocab.score[i] = score;
-            //if (i < 30000) {
-            //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
-            //}
         }
     }

@@ -178,8 +220,6 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         }
     }

-    const ggml_type wtype2 = GGML_TYPE_F32;
-
     auto & ctx = model.ctx;

     size_t ctx_size = 0;

@@ -211,8 +251,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v

         ctx_size += (5 + 10*n_layer)*256; // object overhead
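Aside: a rough sanity check on what the memory_k / memory_v sizing above means in practice. The dimensions in this sketch (n_ctx = 512, n_layer = 32, n_embd = 4096) are assumed example values, not read from this diff; the point is only that switching the KV cache from F32 to F16 halves its footprint.

#include <cstdint>
#include <cstdio>

int main() {
    // assumed example dimensions, for illustration only
    const uint64_t n_ctx   = 512;
    const uint64_t n_layer = 32;
    const uint64_t n_embd  = 4096;

    // one tensor's element count (memory_k or memory_v); there are two such tensors
    const uint64_t n_elements = n_ctx*n_layer*n_embd;

    const double mib_f32 = 2.0*n_elements*4/(1024.0*1024.0); // 4 bytes per F32 element
    const double mib_f16 = 2.0*n_elements*2/(1024.0*1024.0); // 2 bytes per F16 element

    printf("KV cache: %.0f MiB (f32) vs %.0f MiB (f16)\n", mib_f32, mib_f16); // 512 vs 256
    return 0;
}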
@@ -239,7 +279,6 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab

         const int n_embd  = hparams.n_embd;
         const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;

         model.layers.resize(n_layer);

@@ -298,8 +337,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         const int n_mem      = n_layer*n_ctx;
         const int n_elements = n_embd*n_mem;

-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+        model.memory_k = ggml_new_tensor_1d(ctx, memory_type, n_elements);
+        model.memory_v = ggml_new_tensor_1d(ctx, memory_type, n_elements);

         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

@@ -527,9 +566,10 @@ bool llama_eval(
         const llama_model & model,
         const int n_threads,
         const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
+        const std::vector<llama_vocab::id> & embd_inp,
         std::vector<float> & embd_w,
-        size_t & mem_per_token) {
+        size_t & mem_per_token,
+        bool return_all_logits = false) {
     const int N = embd_inp.size();

     const auto & hparams = model.hparams;

@@ -541,15 +581,13 @@ bool llama_eval(
     const int n_vocab = hparams.n_vocab;
     const int n_rot   = hparams.n_embd/hparams.n_head;

-    const int d_key = n_embd/n_head;
-
     // TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case
     // static size_t buf_size = hparams.n_ctx*1024*1024;
     static size_t buf_size = 512u*1024*1024;
     static void * buf = malloc(buf_size);

     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        const size_t buf_size_new = 1.3*(mem_per_token*N); // add 30% to account for ggml object overhead
         //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

         // reallocate

@@ -735,9 +773,14 @@ bool llama_eval(
     //embd_w.resize(n_vocab*N);
     //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

-    // return result for just the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+    if (return_all_logits) {
+        embd_w.resize(n_vocab * N);
+        memcpy(embd_w.data(), (float *) ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+    } else {
+        // return result for just the last token
+        embd_w.resize(n_vocab);
+        memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+    }

     if (mem_per_token == 0) {
         mem_per_token = ggml_used_mem(ctx0)/N;

@@ -749,11 +792,82 @@ bool llama_eval(
     return true;
 }

+std::vector<double> softmax(const std::vector<float>& logits) {
+    std::vector<double> probs(logits.size());
+    float max_logit = logits[0];
+    for (float v : logits) max_logit = std::max(max_logit, v);
+    double sum_exp = 0.0;
+    for (size_t i = 0; i < logits.size(); i++) {
+        // Subtract the maximum logit value from the current logit value for numerical stability
+        float logit = logits[i] - max_logit;
+        double exp_logit = std::exp(logit);
+        sum_exp += exp_logit;
+        probs[i] = exp_logit;
+    }
+    for (size_t i = 0; i < probs.size(); i++) probs[i] /= sum_exp;
+    return probs;
+}
+
+void perplexity(const llama_vocab &vocab, const llama_model &model, const gpt_params &params, size_t mem_per_token) {
+    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
+    // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Output: `perplexity: 13.5106 [114/114]`
+    std::vector<llama_vocab::id> tokens = ::llama_tokenize(vocab, params.prompt, true);
+
+    int count = 0;
+    double nll = 0.0;
+    int seq_count = tokens.size() / params.n_ctx;
+    printf("Calculating perplexity over %d chunks\n", seq_count);
+    for (int i = 0; i < seq_count; ++i) {
+        int start = i * params.n_ctx;
+        int end = start + params.n_ctx - 1;
+        std::vector<llama_vocab::id> embd(tokens.begin() + start, tokens.begin() + end);
+        std::vector<float> logits;
+        auto start_t = std::chrono::high_resolution_clock::now();
+        if (!llama_eval(model, params.n_threads, 0, embd, logits, mem_per_token, true)) {
+            fprintf(stderr, "Failed to predict\n");
+            return;
+        }
+        auto end_t = std::chrono::high_resolution_clock::now();
+        if (i == 0) {
+            double seconds = std::chrono::duration<double>(end_t - start_t).count();
+            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
+        }
+        // We get the logits for all the tokens in the context window (params.n_ctx)
+        // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity,
+        // calculate the perplexity over the last half the window (so the model always has
+        // some context to predict the token).
+        //
+        // We rely on the fact that attention in the forward pass only looks at previous
+        // tokens here, so the logits returned for each token are an accurate representation
+        // of what the model would have predicted at that point.
+        //
+        // Example, we have a context window of 512, we will compute perplexity for each of the
+        // last 256 tokens. Then, we split the input up into context window size chunks to
+        // process the entire prompt.
+        for (int j = params.n_ctx / 2; j < params.n_ctx - 1; ++j) {
+            // Calculate probability of next token, given the previous ones.
+            int n_vocab = model.hparams.n_vocab;
+            std::vector<float> tok_logits(
+                logits.begin() + j * n_vocab,
+                logits.begin() + (j + 1) * n_vocab);
+            double prob = softmax(tok_logits)[tokens[start + j + 1]];
+            nll += -std::log(prob);
+            ++count;
+        }
+        // perplexity is e^(average negative log-likelihood)
+        printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
+        fflush(stdout);
+    }
+    printf("\n");
+}
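Aside: to make the relationship used by perplexity() explicit, the reported number is exp of the average negative log-likelihood of the predicted tokens. A minimal self-contained sketch; the per-token probabilities below are made-up illustrative values, not model output.

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    // hypothetical probabilities a model assigned to the observed next tokens
    const std::vector<double> probs = { 0.25, 0.10, 0.50, 0.05 };

    double nll = 0.0;
    for (double p : probs) nll += -std::log(p); // accumulate negative log-likelihood

    const double ppl = std::exp(nll / probs.size()); // perplexity = e^(mean NLL)
    printf("perplexity = %.4f\n", ppl);
    return 0;
}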

 static bool is_interacting = false;

 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
     printf(ANSI_COLOR_RESET);
+    printf("\n"); // this also force flush stdout.
     if (signo == SIGINT) {
         if (!is_interacting) {
             is_interacting=true;

@@ -795,6 +909,11 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified);"
+                "expect poor results\n", __func__, params.n_ctx);
+    }
+
     if (params.seed < 0) {
         params.seed = time(NULL);
     }

@@ -802,7 +921,7 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

     std::mt19937 rng(params.seed);
-    if (params.prompt.empty()) {
+    if (params.random_prompt) {
         params.prompt = gpt_random_prompt(rng);
     }

@@ -811,13 +930,14 @@ int main(int argc, char ** argv) {

     int64_t t_load_us = 0;

-    gpt_vocab vocab;
+    llama_vocab vocab;
     llama_model model;

     // load the model
     {
+        const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
         const int64_t t_start_us = ggml_time_us();
-        if (!llama_model_load(params.model, model, vocab, params.n_ctx)) {
+        if (!llama_model_load(params.model, model, vocab, params.n_ctx, params.n_parts, memory_type)) {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
             return 1;
         }

@@ -832,22 +952,43 @@ int main(int argc, char ** argv) {
                 params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info());
     }

+    std::vector<float> logits;
+
+    // determine the required inference memory per token:
+    size_t mem_per_token = 0;
+    llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+
+    if (params.perplexity) {
+        perplexity(vocab, model, params, mem_per_token);
+        exit(0);
+    }
+
     int n_past = 0;

     int64_t t_sample_us  = 0;
     int64_t t_predict_us = 0;

-    std::vector<float> logits;
-
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<llama_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);

     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());

-    // tokenize the reverse prompt
-    std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
+    // prefix & suffix for instruct mode
+    const std::vector<llama_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
+    const std::vector<llama_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
+
+    // in instruct mode, we inject a prefix and a suffix to each input by the user
+    if (params.instruct) {
+        params.interactive = true;
+        params.antiprompt.push_back("### Instruction:\n\n");
+    }
+
+    // enable interactive mode if reverse prompt is specified
+    if (params.antiprompt.size() != 0) {
+        params.interactive = true;
+    }

     fprintf(stderr, "\n");
     fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());

@@ -869,53 +1010,50 @@ int main(int argc, char ** argv) {

         fprintf(stderr, "%s: interactive mode on.\n", __func__);

-        if(antiprompt_inp.size()) {
-            fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
-            fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
-            for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
-                fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+        if(params.antiprompt.size()) {
+            for (auto antiprompt : params.antiprompt) {
+                fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
             }
-            fprintf(stderr, "\n");
         }
     }
     fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
     fprintf(stderr, "\n\n");

-    std::vector<gpt_vocab::id> embd;
-
-    // determine the required inference memory per token:
-    size_t mem_per_token = 0;
-    llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
+    std::vector<llama_vocab::id> embd;

     int last_n_size = params.repeat_last_n;
-    std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
+    std::vector<llama_vocab::id> last_n_tokens(last_n_size);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);

     if (params.interactive) {
         fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
                " - Press Return to return control to LLaMa.\n"
-               " - If you want to submit another line, end your input in '\\'.\n");
-    }
-
-    int remaining_tokens = params.n_predict;
-    int input_consumed = 0;
-    bool input_noecho = false;
-
-    // prompt user immediately after the starting prompt has been loaded
-    if (params.interactive_start) {
+               " - If you want to submit another line, end your input in '\\'.\n\n");
         is_interacting = true;
     }

+    int input_consumed = 0;
+    bool input_noecho = false;
+
+    int remaining_tokens = params.n_predict;
+
     // set the color for the prompt which will be output initially
     if (params.use_color) {
+#if defined (_WIN32)
+        // Enable ANSI colors on Windows 10+
+        unsigned long dwMode = 0;
+        void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
+        if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
+            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
+        }
+#endif
         printf(ANSI_COLOR_YELLOW);
     }

-    while (remaining_tokens > 0) {
+    while (remaining_tokens > 0 || params.interactive) {
         // predict
         if (embd.size() > 0) {
             const int64_t t_start_us = ggml_time_us();

@@ -931,7 +1069,7 @@ int main(int argc, char ** argv) {
         n_past += embd.size();
         embd.clear();

-        if (embd_inp.size() <= input_consumed) {
+        if ((int) embd_inp.size() <= input_consumed) {
             // out of user input, sample next token
             const float top_k = params.top_k;
             const float top_p = params.top_p;

@@ -940,11 +1078,16 @@ int main(int argc, char ** argv) {

             const int n_vocab = model.hparams.n_vocab;

-            gpt_vocab::id id = 0;
+            llama_vocab::id id = 0;

             {
                 const int64_t t_start_sample_us = ggml_time_us();

+                if (params.ignore_eos) {
+                    // set the logit of the eos token to zero to avoid sampling it
+                    logits[logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
+                }
+
                 id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);

                 last_n_tokens.erase(last_n_tokens.begin());

@@ -963,20 +1106,15 @@ int main(int argc, char ** argv) {
             --remaining_tokens;
         } else {
             // some user input remains from prompt or interaction, forward it to processing
-            while (embd_inp.size() > input_consumed) {
+            while ((int) embd_inp.size() > input_consumed) {
                 embd.push_back(embd_inp[input_consumed]);
                 last_n_tokens.erase(last_n_tokens.begin());
                 last_n_tokens.push_back(embd_inp[input_consumed]);
                 ++input_consumed;
-                if (embd.size() > params.n_batch) {
+                if ((int) embd.size() >= params.n_batch) {
                     break;
                 }
             }
-
-            // reset color to default if we there is no pending user input
-            if (!input_noecho && params.use_color && embd_inp.size() == input_consumed) {
-                printf(ANSI_COLOR_RESET);
-            }
         }

         // display text

@@ -986,56 +1124,79 @@ int main(int argc, char ** argv) {
             }
             fflush(stdout);
         }
+        // reset color to default if we there is no pending user input
+        if (!input_noecho && params.use_color && (int)embd_inp.size() == input_consumed) {
+            printf(ANSI_COLOR_RESET);
+        }

         // in interactive mode, and not currently processing queued inputs;
         // check if we should prompt the user for more
-        if (params.interactive && embd_inp.size() <= input_consumed) {
+        if (params.interactive && (int) embd_inp.size() <= input_consumed) {
             // check for reverse prompt
-            if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) {
-                // reverse prompt found
-                is_interacting = true;
+            std::string last_output;
+            for (auto id : last_n_tokens) {
+                last_output += vocab.id_to_token[id];
+            }
+
+            // Check if each of the reverse prompts appears at the end of the output.
+            for (std::string antiprompt : params.antiprompt) {
+                if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
+                    is_interacting = true;
+                    break;
+                }
             }
             if (is_interacting) {
-                // currently being interactive
-                bool another_line=true;
-                while (another_line) {
-                    fflush(stdout);
-                    char buf[256] = {0};
-                    int n_read;
-                    if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                    if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
-                        // presumable empty line, consume the newline
-                        std::ignore = scanf("%*c");
-                        n_read=0;
-                    }
-                    if(params.use_color) printf(ANSI_COLOR_RESET);
-
-                    if (n_read > 0 && buf[n_read-1]=='\\') {
-                        another_line = true;
-                        buf[n_read-1] = '\n';
-                        buf[n_read] = 0;
-                    } else {
-                        another_line = false;
-                        buf[n_read] = '\n';
-                        buf[n_read+1] = 0;
-                    }
-
-                    std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
-                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
-
-                    remaining_tokens -= line_inp.size();
-
-                    input_noecho = true; // do not echo this again
+                if (params.instruct) {
+                    input_consumed = embd_inp.size();
+                    embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
+
+                    printf("\n> ");
                 }
-
-                is_interacting = false;
+
+                // currently being interactive
+                if (params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
+                std::string buffer;
+                std::string line;
+                bool another_line = true;
+                do {
+                    std::getline(std::cin, line);
+                    if (line.empty() || line.back() != '\\') {
+                        another_line = false;
+                    } else {
+                        line.pop_back(); // Remove the continue character
+                    }
+                    buffer += line + '\n'; // Append the line to the result
+                } while (another_line);
+                if (params.use_color) printf(ANSI_COLOR_RESET);
+
+                std::vector<llama_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
+                embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+
+                if (params.instruct) {
+                    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                }
+
+                remaining_tokens -= line_inp.size();
+
+                input_noecho = true; // do not echo this again
             }
+            is_interacting = false;
         }

         // end of text token
-        if (embd.back() == 2) {
-            fprintf(stderr, " [end of text]\n");
-            break;
+        if (embd.back() == EOS_TOKEN_ID) {
+            if (params.interactive) {
+                is_interacting = true;
+            } else {
+                fprintf(stderr, " [end of text]\n");
+                break;
+            }
+        }
+
+        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
+        if (params.interactive && remaining_tokens <= 0) {
+            remaining_tokens = params.n_predict;
+            is_interacting = true;
         }
     }
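Aside: the reverse-prompt check in the loop above boils down to an "ends with" test on the decoded tail of the recent output. A standalone sketch of that idea, with a plain helper instead of the std::string::find call used in the diff; the strings below are illustrative.

#include <cstdio>
#include <string>
#include <vector>

// true if `text` ends with `suffix` (an empty suffix counts as a match)
static bool ends_with(const std::string & text, const std::string & suffix) {
    return text.size() >= suffix.size() &&
           text.compare(text.size() - suffix.size(), suffix.size(), suffix) == 0;
}

int main() {
    const std::string last_output = "...and that is my answer.\nUser:"; // hypothetical decoded tail
    const std::vector<std::string> antiprompts = { "User:", "### Instruction:\n\n" };

    for (const auto & ap : antiprompts) {
        if (ends_with(last_output, ap)) {
            printf("reverse prompt '%s' found - returning control to the user\n", ap.c_str());
            break;
        }
    }
    return 0;
}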

models/ggml-vocab.bin (new binary file)
Binary file not shown.
prompts/alpaca.txt (new file, 1 line)
@@ -0,0 +1 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.
prompts/chat-with-bob.txt (new file, 7 lines)
@@ -0,0 +1,7 @@
Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.

User: Hello, Bob.
Bob: Hello. How may I help you today?
User: Please tell me the largest city in Europe.
Bob: Sure. The largest city in Europe is Moscow, the capital of Russia.
User:
quantize.cpp (26 lines changed)
@@ -3,6 +3,7 @@
 #include "utils.h"

 #include <cassert>
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>

@@ -43,7 +44,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
         return false;
     }

-    gpt_vocab vocab;
+    llama_vocab vocab;

     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@@ -63,12 +64,28 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
     {
         uint32_t magic;
         finp.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
+        if (magic == FILE_MAGIC_UNVERSIONED) {
+            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
+                    __func__, fname_inp.c_str());
+            return false;
+        }
+        if (magic != FILE_MAGIC) {
             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
             return false;
         }

         fout.write((char *) &magic, sizeof(magic));
+
+        uint32_t format_version;
+        finp.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != FILE_VERSION) {
+            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
+                    __func__, fname_inp.c_str(), format_version, FILE_VERSION);
+            return false;
+        }
+
+        fout.write((char *) &format_version, sizeof(format_version));
     }

     llama_hparams hparams;

@@ -122,8 +139,13 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
             finp.read ((char *) word.data(), len);
             fout.write((char *) word.data(), len);

+            float score;
+            finp.read ((char *) &score, sizeof(score));
+            fout.write((char *) &score, sizeof(score));
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
+            vocab.score[i] = score;
         }
     }
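Aside: both llama_model_load and llama_model_quantize now follow the same "magic, then format version" header check shown above. A minimal sketch of that pattern under stated assumptions: FILE_MAGIC and FILE_VERSION live in the project's headers and are not shown in this diff, so the versioned constants below are placeholders; only the unversioned value mirrors the 0x67676d6c literal visible in the removed lines.

#include <cstdint>
#include <cstdio>
#include <fstream>

// placeholder values for illustration only - use the project's own constants
static const uint32_t kMagicUnversioned = 0x67676d6c; // the pre-versioning magic checked by the removed code
static const uint32_t kMagic            = 0x12345678; // hypothetical versioned magic
static const uint32_t kVersion          = 1;          // hypothetical expected version

bool check_header(std::ifstream & fin, const char * fname) {
    uint32_t magic = 0;
    fin.read((char *) &magic, sizeof(magic));
    if (magic == kMagicUnversioned) {
        fprintf(stderr, "%s: too old, regenerate your model files!\n", fname);
        return false;
    }
    if (magic != kMagic) {
        fprintf(stderr, "%s: bad magic\n", fname);
        return false;
    }

    uint32_t version = 0;
    fin.read((char *) &version, sizeof(version));
    if (version != kVersion) {
        fprintf(stderr, "%s: unsupported format version %u\n", fname, version);
        return false;
    }
    return true;
}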

quantize.py (new file, 126 lines)
@@ -0,0 +1,126 @@
#!/usr/bin/env python3

"""Script to execute the "quantize" script on a given set of models."""

import subprocess
import argparse
import glob
import sys
import os


def main():
    """Update the quantize binary name depending on the platform and parse
    the command line arguments and execute the script.
    """

    if "linux" in sys.platform or "darwin" in sys.platform:
        quantize_script_binary = "quantize"

    elif "win32" in sys.platform or "cygwin" in sys.platform:
        quantize_script_binary = "quantize.exe"

    else:
        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
        quantize_script_binary = "quantize"

    parser = argparse.ArgumentParser(
        prog='python3 quantize.py',
        description='This script quantizes the given models by applying the '
        f'"{quantize_script_binary}" script on them.'
    )
    parser.add_argument(
        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
        help='The models to quantize.'
    )
    parser.add_argument(
        '-r', '--remove-16', action='store_true', dest='remove_f16',
        help='Remove the f16 model after quantizing it.'
    )
    parser.add_argument(
        '-m', '--models-path', dest='models_path',
        default=os.path.join(os.getcwd(), "models"),
        help='Specify the directory where the models are located.'
    )
    parser.add_argument(
        '-q', '--quantize-script-path', dest='quantize_script_path',
        default=os.path.join(os.getcwd(), quantize_script_binary),
        help='Specify the path to the "quantize" script.'
    )

    # TODO: Revise this code
    # parser.add_argument(
    #     '-t', '--threads', dest='threads', type='int',
    #     default=os.cpu_count(),
    #     help='Specify the number of threads to use to quantize many models at '
    #     'once. Defaults to os.cpu_count().'
    # )

    args = parser.parse_args()

    if not os.path.isfile(args.quantize_script_path):
        print(
            f'The "{quantize_script_binary}" script was not found in the '
            "current location.\nIf you want to use it from another location, "
            "set the --quantize-script-path argument from the command line."
        )
        sys.exit(1)

    for model in args.models:
        # The model is separated in various parts
        # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
        f16_model_path_base = os.path.join(
            args.models_path, model, "ggml-model-f16.bin"
        )

        f16_model_parts_paths = map(
            lambda filename: os.path.join(f16_model_path_base, filename),
            glob.glob(f"{f16_model_path_base}*")
        )

        for f16_model_part_path in f16_model_parts_paths:
            if not os.path.isfile(f16_model_part_path):
                print(
                    f"The f16 model {os.path.basename(f16_model_part_path)} "
                    f"was not found in {args.models_path}{os.path.sep}{model}"
                    ". If you want to use it from another location, set the "
                    "--models-path argument from the command line."
                )
                sys.exit(1)

            __run_quantize_script(
                args.quantize_script_path, f16_model_part_path
            )

            if args.remove_f16:
                os.remove(f16_model_part_path)


# This was extracted to a top-level function for parallelization, if
# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406

def __run_quantize_script(script_path, f16_model_part_path):
    """Run the quantize script specifying the path to it and the path to the
    f16 model to quantize.
    """

    new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
    subprocess.run(
        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
        check=True
    )


if __name__ == "__main__":
    try:
        main()

    except subprocess.CalledProcessError:
        print("\nAn error ocurred while trying to quantize the models.")
        sys.exit(1)

    except KeyboardInterrupt:
        sys.exit(0)

    else:
        print("\nSuccesfully quantized all models.")

quantize.sh (deleted, 15 lines)
@@ -1,15 +0,0 @@
#!/usr/bin/env bash

if ! [[ "$1" =~ ^[0-9]{1,2}B$ ]]; then
    echo
    echo "Usage: quantize.sh 7B|13B|30B|65B [--remove-f16]"
    echo
    exit 1
fi

for i in `ls models/$1/ggml-model-f16.bin*`; do
    ./quantize "$i" "${i/f16/q4_0}" 2
    if [[ "$2" == "--remove-f16" ]]; then
        rm "$i"
    fi
done

tests/CMakeLists.txt (new file, 4 lines)
@@ -0,0 +1,4 @@
set(TEST_TARGET test-tokenizer-0)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE utils)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)

tests/test-tokenizer-0.cpp (new file, 69 lines)
@@ -0,0 +1,69 @@
#include "utils.h"

#include <cstdio>
#include <string>
#include <map>

static const std::map<std::string, std::vector<llama_vocab::id>> k_tests = {
    { "Hello World",        { 1,  10994,   2787, }, },
    { " Hello World",       { 1,  15043,   2787, }, },
    { " Hello World!",      { 1,  15043,   2787,  29991, }, },
    { " this is 🦙.cpp",    { 1,    445,    338,  29871,    243,    162,    169,    156,  29889,   8223, }, },
    { "w048 7tuijk dsdfhu", { 1,  29893,  29900,  29946,  29947,  29871,  29955,   9161,  13535,  18031,   2176,   6905, }, },
    { "нещо на Български",  { 1,    821,   4851,    665,   1386,  29713,   1305, }, },
};

int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    llama_vocab vocab;

    if (!llama_vocab_load(fname, vocab)) {
        fprintf(stderr, "%s : failed to load vocab from: '%s'\n", __func__, fname.c_str());
        return 1;
    }

    const int n_vocab = vocab.id_to_token.size();

    if (n_vocab != 32000) {
        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
        return 2;
    }

    for (const auto & test_kv : k_tests) {
        const auto res = llama_tokenize(vocab, test_kv.first, true);

        bool correct = res.size() == test_kv.second.size();

        for (int i = 0; i < (int) res.size() && correct; ++i) {
            if (res[i] != test_kv.second[i]) {
                correct = false;
            }
        }

        if (!correct) {
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
            fprintf(stderr, "%s : expected tokens: ", __func__);
            for (const auto & t : test_kv.second) {
                fprintf(stderr, "%6d, ", t);
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "%s : got tokens: ", __func__);
            for (const auto & t : res) {
                fprintf(stderr, "%6d, ", t);
            }
            fprintf(stderr, "\n");

            return 3;
        }
    }

    return 0;
}
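Aside: new entries for k_tests need reference ids that have been double-checked against a trusted tokenizer. A hedged sketch of a hypothetical helper (not part of this commit) that prints candidate ids using the same utils API the test above relies on; the ids it produces depend on the vocab file passed in.

#include "utils.h"

#include <cstdio>

// Hypothetical helper: dump the ids produced for a prompt so they can be
// pasted into k_tests after being verified against a reference tokenizer.
int main(int argc, char **argv) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <vocab-file> <text>\n", argv[0]);
        return 1;
    }

    llama_vocab vocab;
    if (!llama_vocab_load(argv[1], vocab)) {
        fprintf(stderr, "failed to load vocab from '%s'\n", argv[1]);
        return 1;
    }

    for (const auto id : llama_tokenize(vocab, argv[2], true)) {
        printf("%d, ", id);
    }
    printf("\n");
    return 0;
}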
|
330
utils.cpp
330
utils.cpp
|
@ -6,12 +6,13 @@
|
||||||
#include <regex>
|
#include <regex>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <iterator>
|
#include <iterator>
|
||||||
|
#include <queue>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <math.h>
|
#include <math.h>
|
||||||
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||||
#include <malloc.h> // using malloc.h with MSC/MINGW
|
#include <malloc.h> // using malloc.h with MSC/MINGW
|
||||||
#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
|
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
|
||||||
#include <alloca.h>
|
#include <alloca.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -38,19 +39,19 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
} else if (arg == "-p" || arg == "--prompt") {
|
} else if (arg == "-p" || arg == "--prompt") {
|
||||||
params.prompt = argv[++i];
|
params.prompt = argv[++i];
|
||||||
} else if (arg == "-f" || arg == "--file") {
|
} else if (arg == "-f" || arg == "--file") {
|
||||||
|
|
||||||
std::ifstream file(argv[++i]);
|
std::ifstream file(argv[++i]);
|
||||||
|
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
|
||||||
std::copy(std::istreambuf_iterator<char>(file),
|
if (params.prompt.back() == '\n') {
|
||||||
std::istreambuf_iterator<char>(),
|
params.prompt.pop_back();
|
||||||
back_inserter(params.prompt));
|
}
|
||||||
|
|
||||||
} else if (arg == "-n" || arg == "--n_predict") {
|
} else if (arg == "-n" || arg == "--n_predict") {
|
||||||
params.n_predict = std::stoi(argv[++i]);
|
params.n_predict = std::stoi(argv[++i]);
|
||||||
} else if (arg == "--top_k") {
|
} else if (arg == "--top_k") {
|
||||||
params.top_k = std::stoi(argv[++i]);
|
params.top_k = std::stoi(argv[++i]);
|
||||||
} else if (arg == "-c" || arg == "--ctx_size") {
|
} else if (arg == "-c" || arg == "--ctx_size") {
|
||||||
params.n_ctx = std::stoi(argv[++i]);
|
params.n_ctx = std::stoi(argv[++i]);
|
||||||
|
} else if (arg == "--memory_f16") {
|
||||||
|
params.memory_f16 = true;
|
||||||
} else if (arg == "--top_p") {
|
} else if (arg == "--top_p") {
|
||||||
params.top_p = std::stof(argv[++i]);
|
params.top_p = std::stof(argv[++i]);
|
||||||
} else if (arg == "--temp") {
|
} else if (arg == "--temp") {
|
||||||
|
@ -65,16 +66,23 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
params.model = argv[++i];
|
params.model = argv[++i];
|
||||||
} else if (arg == "-i" || arg == "--interactive") {
|
} else if (arg == "-i" || arg == "--interactive") {
|
||||||
params.interactive = true;
|
params.interactive = true;
|
||||||
} else if (arg == "--interactive-start") {
|
} else if (arg == "-ins" || arg == "--instruct") {
|
||||||
params.interactive = true;
|
params.instruct = true;
|
||||||
params.interactive_start = true;
|
|
||||||
} else if (arg == "--color") {
|
} else if (arg == "--color") {
|
||||||
params.use_color = true;
|
params.use_color = true;
|
||||||
} else if (arg == "-r" || arg == "--reverse-prompt") {
|
} else if (arg == "-r" || arg == "--reverse-prompt") {
|
||||||
params.antiprompt = argv[++i];
|
params.antiprompt.push_back(argv[++i]);
|
||||||
|
} else if (arg == "--perplexity") {
|
||||||
|
params.perplexity = true;
|
||||||
|
} else if (arg == "--ignore-eos") {
|
||||||
|
params.ignore_eos = true;
|
||||||
|
} else if (arg == "--n_parts") {
|
||||||
|
params.n_parts = std::stoi(argv[++i]);
|
||||||
} else if (arg == "-h" || arg == "--help") {
|
} else if (arg == "-h" || arg == "--help") {
|
||||||
gpt_print_usage(argc, argv, params);
|
gpt_print_usage(argc, argv, params);
|
||||||
exit(0);
|
exit(0);
|
||||||
|
} else if (arg == "--random-prompt") {
|
||||||
|
params.random_prompt = true;
|
||||||
} else {
|
} else {
|
||||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||||
gpt_print_usage(argc, argv, params);
|
gpt_print_usage(argc, argv, params);
|
||||||
|
@ -85,20 +93,22 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
|
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "options:\n");
|
fprintf(stderr, "options:\n");
|
||||||
fprintf(stderr, " -h, --help show this help message and exit\n");
|
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||||
fprintf(stderr, " -i, --interactive run in interactive mode\n");
|
fprintf(stderr, " -i, --interactive run in interactive mode\n");
|
||||||
fprintf(stderr, " --interactive-start run in interactive mode and poll user input at startup\n");
|
fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n");
|
||||||
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
|
fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n");
|
||||||
fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT\n");
|
fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT (can be\n");
|
||||||
|
fprintf(stderr, " specified more than once for multiple prompts).\n");
|
||||||
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
|
fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n");
|
||||||
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
|
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
|
||||||
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||||
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
|
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
|
||||||
fprintf(stderr, " prompt to start generation with (default: random)\n");
|
fprintf(stderr, " prompt to start generation with (default: empty)\n");
|
||||||
|
fprintf(stderr, " --random-prompt start with a randomized prompt.\n");
|
||||||
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
fprintf(stderr, " -f FNAME, --file FNAME\n");
|
||||||
fprintf(stderr, " prompt file to start generation.\n");
|
fprintf(stderr, " prompt file to start generation.\n");
|
||||||
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
|
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
|
||||||
|
@ -107,8 +117,12 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||||
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
|
fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
|
||||||
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
|
fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
|
||||||
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
|
fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||||
|
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
|
||||||
|
fprintf(stderr, " --memory_f16 use f16 instead of f32 for memory key+value\n");
|
||||||
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
|
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
|
||||||
|
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
|
||||||
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
|
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
|
||||||
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||||
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
@ -232,151 +246,209 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
static size_t utf8_len(char src) {
|
||||||
std::vector<std::string> words;
|
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
|
||||||
|
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
||||||
// first split the text into words
|
return lookup[highbits];
|
||||||
{
|
|
||||||
std::string str = text;
|
|
||||||
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
|
||||||
|
|
||||||
std::regex re(pat);
|
|
||||||
std::smatch m;
|
|
||||||
|
|
||||||
while (std::regex_search(str, m, re)) {
|
|
||||||
for (auto x : m) {
|
|
||||||
words.push_back(x);
|
|
||||||
}
|
|
||||||
str = m.suffix();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// find the longest tokens that form the words:
|
|
||||||
std::vector<gpt_vocab::id> tokens;
|
|
||||||
for (const auto & word : words) {
|
|
||||||
if (word.size() == 0) continue;
|
|
||||||
|
|
||||||
int i = 0;
|
|
||||||
int n = word.size();
|
|
||||||
while (i < n) {
|
|
||||||
int j = n;
|
|
||||||
while (j > i) {
|
|
||||||
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
|
||||||
if (it != vocab.token_to_id.end()) {
|
|
||||||
tokens.push_back(it->second);
|
|
||||||
i = j;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
--j;
|
|
||||||
}
|
|
||||||
if (i == n) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (j == i) {
|
|
||||||
auto sub = word.substr(i, 1);
|
|
||||||
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
|
||||||
tokens.push_back(vocab.token_to_id.at(sub));
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
|
||||||
}
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return tokens;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Calculate this constant from the vocabulary
|
struct llama_sp_symbol {
|
||||||
#define MAX_TOKEN_LEN 18
|
using index = int;
|
||||||
// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
|
index prev;
|
||||||
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
|
index next;
|
||||||
std::vector<gpt_vocab::id> res;
|
const char * text;
|
||||||
std::vector<int> score;
|
size_t n;
|
||||||
std::vector<gpt_vocab::id> prev;
|
};
|
||||||
int len = text.length();
|
|
||||||
|
|
||||||
score.resize(len + 1);
|
struct llama_sp_bigram {
|
||||||
prev.resize(len + 1);
|
struct comparator {
|
||||||
|
bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
|
||||||
|
return (l.score < r.score) || (l.score == r.score && l.left > r.left);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
using queue_storage = std::vector<llama_sp_bigram>;
|
||||||
|
using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
|
||||||
|
llama_sp_symbol::index left;
|
||||||
|
llama_sp_symbol::index right;
|
||||||
|
float score;
|
||||||
|
size_t size;
|
||||||
|
};
|
||||||
|
|
||||||
// Forward pass
|
// original implementation:
|
||||||
for (int i = 0; i < len; i++) {
|
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
|
||||||
int max_len = std::min(len - i, MAX_TOKEN_LEN);
|
struct llama_tokenizer {
|
||||||
for (int sub_len = 1; sub_len <= max_len; sub_len++) {
|
llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
|
||||||
auto sub = text.substr(i, sub_len);
|
|
||||||
auto token = vocab.token_to_id.find(sub);
|
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
|
||||||
if (token != vocab.token_to_id.end()) {
|
// split string into utf8 chars
|
||||||
int token_score = sub.length() * sub.length();
|
int index = 0;
|
||||||
int local_score = score[i] + token_score;
|
size_t offs = 0;
|
||||||
int next = i + sub_len;
|
while (offs < text.size()) {
|
||||||
if (score[next] < local_score) {
|
llama_sp_symbol sym;
|
||||||
score[next] = local_score;
|
size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
|
||||||
prev[next] = (*token).second;
|
sym.text = text.c_str() + offs;
|
||||||
|
sym.n = char_len;
|
||||||
|
offs += char_len;
|
||||||
|
sym.prev = index - 1;
|
||||||
|
sym.next = offs == text.size() ? -1 : index + 1;
|
||||||
|
index++;
|
||||||
|
symbols_.emplace_back(std::move(sym));
|
||||||
|
}
|
||||||
|
|
||||||
|
// seed the work queue with all possible 2-character tokens.
|
||||||
|
for (size_t i = 1; i < symbols_.size(); ++i) {
|
||||||
|
try_add_bigram(i - 1, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// keep substituting the highest frequency pairs for as long as we can.
|
||||||
|
while (!work_queue_.empty()) {
|
||||||
|
auto bigram = work_queue_.top();
|
||||||
|
work_queue_.pop();
|
||||||
|
|
||||||
|
auto & left_sym = symbols_[bigram.left];
|
||||||
|
auto & right_sym = symbols_[bigram.right];
|
||||||
|
|
||||||
|
// if one of the symbols already got merged, skip it.
|
||||||
|
if (left_sym.n == 0 || right_sym.n == 0 ||
|
||||||
|
left_sym.n + right_sym.n != bigram.size) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// merge the right sym into the left one
|
||||||
|
left_sym.n += right_sym.n;
|
||||||
|
right_sym.n = 0;
|
||||||
|
|
||||||
|
//printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
|
||||||
|
|
||||||
|
// remove the right sym from the chain
|
||||||
|
left_sym.next = right_sym.next;
|
||||||
|
if (right_sym.next >= 0) {
|
||||||
|
symbols_[right_sym.next].prev = bigram.left;
|
||||||
|
}
|
||||||
|
|
||||||
|
// find more substitutions
|
||||||
|
try_add_bigram(left_sym.prev, bigram.left);
|
||||||
|
try_add_bigram(bigram.left, left_sym.next);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i != -1; i = symbols_[i].next) {
|
||||||
|
auto & symbol = symbols_[i];
|
||||||
|
auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
|
||||||
|
|
||||||
|
if (token == vocab_.token_to_id.end()) {
|
||||||
|
// output any symbols that did not form tokens as bytes.
|
||||||
|
for (int j = 0; j < (int) symbol.n; ++j) {
|
||||||
|
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
|
||||||
|
output.push_back(token_id);
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
output.push_back((*token).second);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
-    // Backward pass
-    int i = len;
-    while (i > 0) {
-        gpt_vocab::id token_id = prev[i];
-        if (token_id == 0) {
-            // TODO: Return error or something more meaningful
-            printf("failed to tokenize string!\n");
-            break;
-        }
-        res.push_back(token_id);
-        auto token = (*vocab.id_to_token.find(token_id)).second;
-        i -= token.length();
-    }
-
-    if (bos) {
-        res.push_back(1); // TODO: replace with vocab.bos
-    }
-
-    // Pieces are in reverse order so correct that
-    std::reverse(res.begin(), res.end());
-
-    return res;
-}
-
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
-    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
-
-    vocab.token_to_id = ::json_parse(fname);
-
-    for (const auto & kv : vocab.token_to_id) {
-        vocab.id_to_token[kv.second] = kv.first;
-    }
-
-    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
-
-    // print the vocabulary
-    //for (auto kv : vocab.token_to_id) {
-    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
-    //}
-
-    return true;
-}
+private:
+    void try_add_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+
+        const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
+        auto token = vocab_.token_to_id.find(text);
+
+        if (token == vocab_.token_to_id.end()) {
+            return;
+        }
+
+        auto score = vocab_.score.find((*token).second);
+
+        if (score == vocab_.score.end()) {
+            return;
+        }
+
+        llama_sp_bigram bigram;
+        bigram.left = left;
+        bigram.right = right;
+        bigram.score = (*score).second;
+        bigram.size = text.size();
+        work_queue_.push(bigram);
+    }
+
+    const llama_vocab & vocab_;
+    std::vector<llama_sp_symbol> symbols_;
+    llama_sp_bigram::queue work_queue_;
+};
+
+// TODO: temporary code duplication with llama.cpp
+// will resolve after #77 is merged
+bool llama_vocab_load(const std::string & fname, llama_vocab & vocab) {
+    std::ifstream fin(fname, std::ios::binary);
+    if (!fin.is_open()) {
+        return false;
+    }
+
+    int n_vocab = 0;
+    fin.read((char *) &n_vocab, sizeof(n_vocab));
+
+    std::string word;
+    std::vector<char> tmp(64);
+
+    for (int i = 0; i < n_vocab; i++) {
+        uint32_t len;
+        fin.read((char *) &len, sizeof(len));
+
+        word.resize(len);
+        if (len > 0) {
+            tmp.resize(len);
+            fin.read(tmp.data(), len);
+            word.assign(tmp.data(), len);
+        } else {
+            word.clear();
+        }
+
+        float score;
+        fin.read((char *) &score, sizeof(score));
+
+        vocab.token_to_id[word] = i;
+        vocab.id_to_token[i] = word;
+        vocab.score[i] = score;
+    }
+
+    return true;
+}
+
+std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
+    llama_tokenizer tokenizer(vocab);
+    std::vector<llama_vocab::id> output;
+
+    if (text.size() == 0) {
+        return output;
+    }
+
+    if (bos) {
+        output.push_back(1);
+    }
+
+    tokenizer.tokenize(text, output);
+    return output;
+}
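For orientation (again not part of the diff), the two new entry points above can be exercised together roughly as follows; the vocab file path is a placeholder and the program assumes it is built against this utils.h/utils.cpp.

    // Minimal usage sketch for llama_vocab_load + llama_tokenize.
    #include <cstdio>
    #include <string>
    #include <vector>

    #include "utils.h"

    int main() {
        llama_vocab vocab;
        if (!llama_vocab_load("models/ggml-vocab.bin", vocab)) { // hypothetical path
            fprintf(stderr, "failed to load vocab\n");
            return 1;
        }

        // bos = true prepends token id 1, matching llama_tokenize above
        const std::vector<llama_vocab::id> tokens = llama_tokenize(vocab, "Hello world", true);

        for (const auto id : tokens) {
            printf("%d -> '%s'\n", id, vocab.id_to_token.at(id).c_str());
        }
        return 0;
    }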
-void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k) {
+void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k) {
     // find the top K tokens
     std::partial_sort(
             logits_id.begin(),
             logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+            [](const std::pair<double, llama_vocab::id> & a, const std::pair<double, llama_vocab::id> & b) {
         return a.first > b.first;
     });

     logits_id.resize(top_k);
 }

-gpt_vocab::id llama_sample_top_p_top_k(
-        const gpt_vocab & vocab,
+llama_vocab::id llama_sample_top_p_top_k(
+        const llama_vocab & vocab,
         const float * logits,
-        std::vector<gpt_vocab::id> & last_n_tokens,
+        std::vector<llama_vocab::id> & last_n_tokens,
         double repeat_penalty,
         int top_k,
         double top_p,

@@ -384,7 +456,7 @@ gpt_vocab::id llama_sample_top_p_top_k(
         std::mt19937 & rng) {
     int n_logits = vocab.id_to_token.size();

-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    std::vector<std::pair<double, llama_vocab::id>> logits_id;
     logits_id.reserve(n_logits);

     {

@@ -398,7 +470,7 @@ gpt_vocab::id llama_sample_top_p_top_k(
                 logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
             } else {
                 logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
             }
         } else {
             logits_id.push_back(std::make_pair(logits[i]*scale, i));
         }

@@ -527,7 +599,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
     char * pdst = (char *) dst;

     for (int j = 0; j < n; j += k) {
         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
         uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));

@@ -550,7 +622,7 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
         *(float *) pd = d;
         *(float *) pm = min;
         pd += bs;
         pm += bs;

         for (int l = 0; l < qk; l += 2) {
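The utils.cpp hunks above only show the llama_sample_top_p_top_k signature and a few context lines, so here is a self-contained sketch of the general top-k / top-p (nucleus) idea that the comments in utils.h describe. It is an illustration of the technique, not the function's actual body, and it omits the repetition penalty and temperature handling visible in the context lines.

    // Sketch: softmax, keep top K, keep smallest prefix with cumulative prob > P, sample.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <random>
    #include <utility>
    #include <vector>

    static int sample_top_k_top_p(const std::vector<float> & logits, int top_k, double top_p, std::mt19937 & rng) {
        std::vector<std::pair<double, int>> probs;
        probs.reserve(logits.size());

        // softmax over the raw logits
        const float max_l = *std::max_element(logits.begin(), logits.end());
        double sum = 0.0;
        for (int i = 0; i < (int) logits.size(); ++i) {
            const double p = std::exp(logits[i] - max_l);
            probs.emplace_back(p, i);
            sum += p;
        }
        for (auto & pr : probs) pr.first /= sum;

        // keep only the top K tokens
        top_k = std::min(top_k, (int) probs.size());
        std::partial_sort(probs.begin(), probs.begin() + top_k, probs.end(),
                          [](const auto & a, const auto & b) { return a.first > b.first; });
        probs.resize(top_k);

        // keep the smallest prefix whose cumulative probability exceeds P
        double cum = 0.0;
        size_t keep = probs.size();
        for (size_t i = 0; i < probs.size(); ++i) {
            cum += probs[i].first;
            if (cum >= top_p) { keep = i + 1; break; }
        }
        probs.resize(keep);

        // renormalize implicitly and sample from the surviving tokens
        std::vector<double> weights;
        for (const auto & pr : probs) weights.push_back(pr.first);
        std::discrete_distribution<int> dist(weights.begin(), weights.end());
        return probs[dist(rng)].second;
    }

    int main() {
        std::mt19937 rng(1234);
        const std::vector<float> logits = {1.0f, 3.0f, 0.5f, 2.5f};
        printf("sampled token id: %d\n", sample_top_k_top_p(logits, 3, 0.95, rng));
        return 0;
    }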
utils.h (63 lines changed)
@@ -13,28 +13,34 @@
 //

 struct gpt_params {
     int32_t seed = -1; // RNG seed
     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64; // last n tokens to penalize
+    int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
     int32_t n_ctx = 512; //context size

     // sampling parameters
     int32_t top_k = 40;
     float top_p = 0.95f;
     float temp = 0.80f;
-    float repeat_penalty = 1.30f;
+    float repeat_penalty = 1.10f;

     int32_t n_batch = 8; // batch size for prompt processing

     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
-    std::string prompt;
+    std::string prompt = "";

-    bool use_color = false; // use color to distinguish generations and inputs
+    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

-    bool interactive = false; // interactive mode
+    bool memory_f16 = false; // use f16 instead of f32 for memory kv
+    bool random_prompt = false; // do not randomize prompt if none provided
+    bool use_color = false; // use color to distinguish generations and inputs
+    bool interactive = false; // interactive mode
     bool interactive_start = false; // reverse prompt immediately
-    std::string antiprompt = ""; // string upon seeing which more user input is prompted
+    bool instruct = false; // instruction mode (used for Alpaca models)
+    bool ignore_eos = false; // do not stop generating after eos
+    bool perplexity = false; // compute perplexity over the prompt
 };

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
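A small sketch of how the updated struct is filled from code (field names come from the hunk above; the model path and antiprompt value are arbitrary examples, not taken from the diff):

    #include <string>
    #include <vector>

    #include "utils.h"

    int main() {
        gpt_params params;
        params.model      = "models/7B/ggml-model-q4_0.bin"; // example path
        params.n_ctx      = 512;
        params.n_parts    = -1;   // determine the number of parts from the model dimensions
        params.instruct   = true; // Alpaca-style instruction mode
        params.ignore_eos = false;
        params.antiprompt.push_back("User:"); // antiprompt is now a vector of strings

        return 0;
    }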
@@ -43,16 +49,25 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 std::string gpt_random_prompt(std::mt19937 & rng);

+//
+// Model file parsing
+//
+
+#define FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+#define FILE_MAGIC 0x67676d66 // 'ggmf' in hex
+#define FILE_VERSION 1
+
 //
 // Vocab utils
 //

-struct gpt_vocab {
+struct llama_vocab {
     using id = int32_t;
     using token = std::string;

     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::map<id, float> score;
 };

 void replace(std::string & str, const std::string & needle, const std::string & replacement);

@@ -60,34 +75,22 @@ void replace(std::string & str, const std::string & needle, const std::string &
 // poor-man's JSON parsing
 std::map<std::string, int32_t> json_parse(const std::string & fname);

-// split text into tokens
-//
-// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
-//
-// Regex (Python):
-// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-//
-// Regex (C++):
-// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
-//
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+// TODO: temporary until #77 is merged, need this now for some tokenizer tests
+bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);

 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);

-// load the tokens from encoder.json
-bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
-
 // sample next token given probabilities for each embedding
 //
 // - consider only the top K tokens
 // - from them, consider only the top tokens with cumulative probability > P
 //
-gpt_vocab::id llama_sample_top_p_top_k(
-        const gpt_vocab & vocab,
+llama_vocab::id llama_sample_top_p_top_k(
+        const llama_vocab & vocab,
         const float * logits,
-        std::vector<gpt_vocab::id> & last_n_tokens,
+        std::vector<llama_vocab::id> & last_n_tokens,
         double repeat_penalty,
         int top_k,
         double top_p,

@@ -95,7 +98,7 @@ gpt_vocab::id llama_sample_top_p_top_k(
         std::mt19937 & rng);

 // filer to top K tokens from list of logits
-void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k);
+void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k);

 //
 // Quantization
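The new model-file constants above imply a small magic/version header at the start of versioned files. Below is a sketch of how a loader might check it; the header layout beyond those two fields is not shown in this diff, so nothing else is read here.

    // Sketch: verify the magic and version at the start of a model/vocab file.
    #include <cstdint>
    #include <cstdio>
    #include <fstream>
    #include <string>

    #define FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
    #define FILE_MAGIC             0x67676d66 // 'ggmf' in hex
    #define FILE_VERSION 1

    static bool check_model_header(const std::string & fname) {
        std::ifstream fin(fname, std::ios::binary);
        if (!fin.is_open()) {
            return false;
        }

        uint32_t magic = 0;
        fin.read((char *) &magic, sizeof(magic));

        if (magic == FILE_MAGIC_UNVERSIONED) {
            fprintf(stderr, "%s: pre-versioned file, no version field present\n", fname.c_str());
            return true;
        }
        if (magic != FILE_MAGIC) {
            fprintf(stderr, "%s: bad magic 0x%08x\n", fname.c_str(), magic);
            return false;
        }

        uint32_t version = 0;
        fin.read((char *) &version, sizeof(version));
        if (version != FILE_VERSION) {
            fprintf(stderr, "%s: unsupported version %u\n", fname.c_str(), version);
            return false;
        }
        return true;
    }

    int main(int argc, char ** argv) {
        if (argc < 2) {
            fprintf(stderr, "usage: %s <model file>\n", argv[0]);
            return 1;
        }
        return check_model_header(argv[1]) ? 0 : 1;
    }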