Merge remote-tracking branch 'origin/master' into perplexity

Gary Linscott 2023-03-21 09:18:25 -07:00
commit 9d1cdb8938
17 changed files with 638 additions and 307 deletions


@ -54,6 +54,7 @@ jobs:
cd build
cmake ..
cmake --build . --config Release
+ctest --output-on-failure

  macOS-latest-make:
    runs-on: macos-latest

@ -90,6 +91,7 @@ jobs:
cd build
cmake ..
cmake --build . --config Release
+ctest --output-on-failure

  windows-latest-cmake:
    runs-on: windows-latest

@ -106,6 +108,7 @@ jobs:
cd build
cmake ..
cmake --build . --config Release
+ctest --output-on-failure

  - name: Get commit hash
    id: commit


@ -40,7 +40,7 @@ jobs:
uses: docker/login-action@v2
with:
  registry: ghcr.io
- username: ${{ github.actor }}
+ username: ${{ github.repository_owner }}
  password: ${{ secrets.GITHUB_TOKEN }}
- name: Build and push Docker image (versioned)


@ -1,131 +1,252 @@
-cmake_minimum_required(VERSION 3.8)
+cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
-project("llama.cpp")
+project("llama.cpp" C CXX)
-set(CMAKE_CXX_STANDARD 20)
-set(CMAKE_CXX_STANDARD_REQUIRED true)
-set(CMAKE_C_STANDARD 11)
-set(THREADS_PREFER_PTHREAD_FLAG ON)
-find_package(Threads REQUIRED)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(LLAMA_STANDALONE ON)
# configure project version
# TODO
else()
set(LLAMA_STANDALONE OFF)
endif()
if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
else()
set(BUILD_SHARED_LIBS_DEFAULT ON)
endif()
endif()
#
# Option list
#
# general
option(LLAMA_STATIC "llama: static link libraries" OFF)
option(LLAMA_NATIVE "llama: enable -march=native flag" OFF)
option(LLAMA_LTO "llama: enable link time optimization" OFF)
# debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON) option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF) option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
option(LLAMA_GPROF "llama: enable gprof" OFF)
# sanitizers
option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF) option(LLAMA_SANITIZE_THREAD "llama: enable thread sanitizer" OFF)
option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF) option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF) option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
if (APPLE) # instruction set specific
option(LLAMA_NO_ACCELERATE "llama: disable Accelerate framework" OFF) option(LLAMA_AVX "llama: enable AVX" ON)
option(LLAMA_NO_AVX "llama: disable AVX" OFF) option(LLAMA_AVX2 "llama: enable AVX2" ON)
option(LLAMA_NO_AVX2 "llama: disable AVX2" OFF) option(LLAMA_FMA "llama: enable FMA" ON)
option(LLAMA_NO_FMA "llama: disable FMA" OFF)
endif() # 3rd party libs
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
#
# Compile flags
#
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(CMAKE_C_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
if (NOT MSVC)
    if (LLAMA_SANITIZE_THREAD)
-       set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
-       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
+       add_compile_options(-fsanitize=thread)
    endif()
    if (LLAMA_SANITIZE_ADDRESS)
-       set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
-       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
+       add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
    endif()
    if (LLAMA_SANITIZE_UNDEFINED)
-       set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
-       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
+       add_compile_options(-fsanitize=undefined)
    endif()
endif()

-if (APPLE AND NOT LLAMA_NO_ACCELERATE)
+if (APPLE AND LLAMA_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
    if (ACCELERATE_FRAMEWORK)
        message(STATUS "Accelerate framework found")
+       add_compile_definitions(GGML_USE_ACCELERATE)
        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
-       set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
    else()
        message(WARNING "Accelerate framework not found")
    endif()
endif()
if (LLAMA_OPENBLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
set(BLA_VENDOR OpenBLAS)
find_package(BLAS)
if (BLAS_FOUND)
message(STATUS "OpenBLAS found")
add_compile_definitions(GGML_USE_OPENBLAS)
add_link_options(${BLAS_LIBRARIES})
else()
message(WARNING "OpenBLAS not found")
endif()
endif()
if (LLAMA_ALL_WARNINGS)
    if (NOT MSVC)
-       set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
-           -Wall \
-           -Wextra \
-           -Wpedantic \
-           -Wshadow \
-           -Wcast-qual \
-           -Wstrict-prototypes \
-           -Wpointer-arith \
-           -Wno-unused-function \
-       ")
+       set(c_flags
+           -Wall
+           -Wextra
+           -Wpedantic
+           -Wshadow
+           -Wcast-qual
+           -Wstrict-prototypes
+           -Wpointer-arith
+           -Wno-unused-function
+       )
-       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-           -Wall \
-           -Wextra \
-           -Wpedantic \
-           -Wcast-qual \
-       ")
+       set(cxx_flags
+           -Wall
+           -Wextra
+           -Wpedantic
+           -Wcast-qual
+       )
    else()
        # todo : msvc
    endif()
+   add_compile_options(
+       "$<$<COMPILE_LANGUAGE:C>:${c_flags}>"
+       "$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>"
+   )
endif()
if (LLAMA_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT result OUTPUT output)
if (result)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else()
message(WARNING "IPO is not supported: ${output}")
endif()
endif()
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (NOT MSVC)
if (LLAMA_STATIC)
add_link_options(-static)
if (MINGW)
add_link_options(-static-libgcc -static-libstdc++)
endif()
endif()
if (LLAMA_GPROF)
add_compile_options(-pg)
endif()
if (LLAMA_NATIVE)
add_compile_options(-march=native)
endif()
endif()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
    message(STATUS "ARM detected")
-   else()
+   if (MSVC)
# TODO: arm msvc?
else()
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
add_compile_options(-mcpu=native)
endif()
# TODO: armv6,7,8 version specific flags
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
    message(STATUS "x86 detected")
    if (MSVC)
-       set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
-       set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
-       set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
+       if (LLAMA_AVX2)
+           add_compile_options(/arch:AVX2)
+       elseif (LLAMA_AVX)
+           add_compile_options(/arch:AVX)
+       endif()
    else()
-       if(NOT LLAMA_NO_AVX)
-           set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
+       add_compile_options(-mf16c)
+       if (LLAMA_FMA)
+           add_compile_options(-mfma)
        endif()
-       if(NOT LLAMA_NO_AVX2)
-           set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
+       if (LLAMA_AVX)
+           add_compile_options(-mavx)
        endif()
-       if(NOT LLAMA_NO_FMA)
-           set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
+       if (LLAMA_AVX2)
+           add_compile_options(-mavx2)
        endif()
-       set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
    endif()
else()
# TODO: support PowerPC
message(STATUS "Unknown architecture")
endif()

-# if (LLAMA_PERF)
-#     set(LLAMA_EXTRA_FLAGS ${LLAMA_EXTRA_FLAGS} -DGGML_PERF)
-# endif()

-add_executable(llama
-    main.cpp
+#
+# Build library
+#
+add_executable(llama main.cpp)
+add_executable(quantize quantize.cpp)
+add_library(utils OBJECT
    utils.cpp
    utils.h)
-add_executable(quantize
-    quantize.cpp
-    utils.cpp
-    utils.h)
+target_include_directories(utils PUBLIC .)
+target_compile_features(utils PUBLIC cxx_std_11) # don't bump
-add_library(ggml
+add_library(ggml OBJECT
    ggml.c
    ggml.h)
-target_compile_definitions(ggml PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_compile_definitions(llama PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_compile_definitions(quantize PUBLIC ${LLAMA_EXTRA_FLAGS})
-target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
target_include_directories(ggml PUBLIC .)
-target_link_libraries(quantize PRIVATE ggml)
+target_compile_features(ggml PUBLIC c_std_11) # don't bump
-target_link_libraries(llama PRIVATE ggml)
-target_link_libraries(ggml PRIVATE Threads::Threads)
+#
+# Linking
+#
+target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
+target_link_libraries(llama PRIVATE ggml utils)
+target_link_libraries(quantize PRIVATE ggml utils)
#
# programs, examples and tests
#
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
enable_testing()
add_subdirectory(tests)
endif ()
#if (LLAMA_BUILD_EXAMPLES)
# add_subdirectory(examples)
#endif()


@ -17,7 +17,7 @@ CXXV := $(shell $(CXX) --version | head -n 1)
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
ifneq ($(UNAME_P),arm)
-SYSCTL_M := $(shell sysctl -n hw.optional.arm64)
+SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
ifeq ($(SYSCTL_M),1)
# UNAME_P := arm
# UNAME_M := arm64

@ -30,8 +30,9 @@ endif
# Compile flags
#
+# keep standard at C11 and C++11
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
# OS specific

@ -52,6 +53,10 @@ ifeq ($(UNAME_S),NetBSD)
CFLAGS += -pthread
CXXFLAGS += -pthread
endif
+ifeq ($(UNAME_S),OpenBSD)
+CFLAGS += -pthread
+CXXFLAGS += -pthread
+endif
ifeq ($(UNAME_S),Haiku)
CFLAGS += -pthread
CXXFLAGS += -pthread

@ -95,6 +100,38 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
ifneq (,$(findstring sse3,$(SSE3_M)))
CFLAGS += -msse3
endif
AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
ifneq (,$(findstring avx512f,$(AVX512F_M)))
CFLAGS += -mavx512f
endif
AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
CFLAGS += -mavx512bw
endif
AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
CFLAGS += -mavx512dq
endif
AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
CFLAGS += -mavx512vl
endif
AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
CFLAGS += -mavx512cd
endif
AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
ifneq (,$(findstring avx512er,$(AVX512ER_M)))
CFLAGS += -mavx512er
endif
AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
CFLAGS += -mavx512ifma
endif
AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
CFLAGS += -mavx512pf
endif
else ifeq ($(UNAME_S),Haiku)
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
ifneq (,$(findstring avx,$(AVX1_M)))

@ -116,9 +153,6 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
CFLAGS += -mfma -mf16c -mavx -mavx2
endif
endif
-ifeq ($(UNAME_M),amd64)
-CFLAGS += -mavx -mavx2 -mfma -mf16c
-endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M)))

@ -130,7 +164,8 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
endif
endif
ifndef LLAMA_NO_ACCELERATE
-# Mac M1 - include Accelerate framework
+# Mac M1 - include Accelerate framework.
+# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
ifeq ($(UNAME_S),Darwin)
CFLAGS += -DGGML_USE_ACCELERATE
LDFLAGS += -framework Accelerate

@ -193,7 +228,7 @@ clean:
main: main.cpp ggml.o utils.o
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
-./main -h
+@echo "\x1b[36mrun ./main -h for help\x1b[0m"
quantize: quantize.cpp ggml.o utils.o
$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)


@ -178,10 +178,15 @@ If you want a more ChatGPT-like experience, you can run in interactive mode by p
In this mode, you can always interrupt generation by pressing Ctrl+C and enter one or more lines of text which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt which makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.

Here is an example few-shot interaction, invoked with the command

-```
+```bash
+# default arguments using 7B model
+./chat.sh
+# custom arguments using 13B model
./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
```

Note the use of `--color` to distinguish between user input and generated text.

![image](https://user-images.githubusercontent.com/1991296/224575029-2af3c7dc-5a65-4f64-a6bb-517a532aea38.png)

@ -192,11 +197,10 @@ First, download the `ggml` Alpaca model into the `./models` folder:
```
# use one of these
-# NOTE: these are copied from the alpaca.cpp repo - not sure how long these will work
# TODO: add a script to simplify the download
-curl -o ggml-alpaca-7b-q4.bin -C - https://gateway.estuary.tech/gw/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
+curl -o ./models/ggml-alpaca-7b-q4.bin -C - https://gateway.estuary.tech/gw/ipfs/QmUp1UGeQFDqJKvtjbSYPBiZZKRjLp8shVP9hT8ZB9Ynv1
-curl -o ggml-alpaca-7b-q4.bin -C - https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
+curl -o ./models/ggml-alpaca-7b-q4.bin -C - https://ipfs.io/ipfs/QmUp1UGeQFDqJKvtjbSYPBiZZKRjLp8shVP9hT8ZB9Ynv1
-curl -o ggml-alpaca-7b-q4.bin -C - https://cloudflare-ipfs.com/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
+curl -o ./models/ggml-alpaca-7b-q4.bin -C - https://cloudflare-ipfs.com/ipfs/QmUp1UGeQFDqJKvtjbSYPBiZZKRjLp8shVP9hT8ZB9Ynv1
```

Now run the `main` tool like this:


@ -3,4 +3,4 @@
# Temporary script - will be removed in the future
#
-./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.96 --repeat_penalty 1 -t 7
+./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7

chat.sh (new executable file, 6 lines)

@ -0,0 +1,6 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
./main -m ./models/7B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt


@ -10,25 +10,26 @@
# - Name (char[name_length])
# - Data (float[n_dims])
#
# By default, the bigger matrices are converted to 16-bit floats.
# This can be disabled by adding the "use-f32" CLI argument.
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#

import argparse
+import os
import sys
import json
import struct
import numpy as np
import torch
from sentencepiece import SentencePieceProcessor

def parse_args():
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
parser.add_argument('dir_model', help='directory containing the model checkpoint')
-parser.add_argument('ftype', type=int, choices=[0, 1], default=1, help='file type (0: float32, 1: float16)')
+parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
+parser.add_argument('vocab_only', help='only write vocab to file', type=int, default=0, nargs='?')
return parser.parse_args()

def get_n_parts(dim):

@ -44,8 +45,14 @@ def get_n_parts(dim):
def load_hparams_and_tokenizer(dir_model):
+# `dir_model` is something like `models/7B` or `models/7B/`.
+# "tokenizer.model" is expected under model's parent dir.
+# When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
+# Let's use the model's parent dir directly.
+model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
fname_hparams = f"{dir_model}/params.json"
-fname_tokenizer = f"{dir_model}/../tokenizer.model"
+fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
with open(fname_hparams, "r") as f:
hparams = json.load(f)

@ -60,7 +67,7 @@ def write_header(fout, hparams, ftype):
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
values = [
-0x67676d66, # magic: ggml in hex
+0x67676d66, # magic: ggmf in hex
1, # file version
*[hparams[key] for key in keys],
hparams["dim"] // hparams["n_heads"], # rot (obsolete)

@ -127,6 +134,29 @@ def main():
ftype_str = ["f32", "f16"]
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
+print(args)
+# if only writing vocab to file
+if args.vocab_only:
+fname_model = f"{dir_model}/consolidated.00.pth"
+fname_out = f"{dir_model}/ggml-vocab.bin"
+print(f"Extracting only the vocab from '{fname_model}'\n")
+model = torch.load(fname_model, map_location="cpu")
+with open(fname_out, "wb") as fout:
+fout.write(struct.pack("i", hparams["vocab_size"]))
+write_tokens(fout, tokenizer)
+del model
+print(f"Done. Output file: {fname_out}\n")
+return
n_parts = get_n_parts(hparams["dim"])
for p in range(n_parts):

@ -144,6 +174,7 @@ def main():
process_and_write_variables(fout, model, ftype)
del model
print(f"Done. Output file: {fname_out}, (part {p})\n")
if __name__ == "__main__":
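For orientation, the leading fields that `write_header()` packs above (and that the loaders in main.cpp and quantize.cpp later in this commit read back via `FILE_MAGIC` and `FILE_VERSION`) can be summarized as below. This is an explanatory sketch, not code from the repository: the hunk is truncated, so any fields written after the rotation value are omitted, and the field names are only suggestive.

```cpp
// Hedged sketch of the start of a versioned 'ggmf' model file, in the order
// write_header() packs the 32-bit values visible in the hunk above.
#include <cstdint>

struct ggmf_header_sketch {
    uint32_t magic;    // 0x67676d66 ("ggmf"); old unversioned files used 0x67676d6c
    uint32_t version;  // file version, currently 1
    int32_t  n_vocab;  // hparams["vocab_size"]
    int32_t  n_embd;   // hparams["dim"]
    int32_t  n_mult;   // hparams["multiple_of"]
    int32_t  n_head;   // hparams["n_heads"]
    int32_t  n_layer;  // hparams["n_layers"]
    int32_t  n_rot;    // hparams["dim"] / hparams["n_heads"] (marked obsolete)
};
```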


@ -34,6 +34,7 @@
cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
chmod +x $out/bin/convert-pth-to-ggml
'';
+meta.mainProgram = "llama";
};
devShells.default = pkgs.mkShell {
packages = with pkgs; [

ggml.c (80 changed lines)

@ -2,7 +2,7 @@
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW
-#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h>
#endif

@ -361,7 +361,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
// AVX routines provided by GH user Const-me
// ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600
-#if __AVX2__
+#if __AVX2__ || __AVX512F__
// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
static inline __m256i bytesFromNibbles( const uint8_t* rsi )

@ -397,7 +397,6 @@ static inline __m128i packNibbles( __m256i bytes )
}
#endif

// method 5
// blocks of QK elements
// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)
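To make the "method 5" comment concrete, the block layout it describes can be pictured as the struct below. This is an illustrative sketch only; as the `pd`/`pb`/`bs` parameters of the AVX-512 helper further down suggest, ggml.c addresses these blocks through raw pointers and a byte stride rather than a named struct.

```cpp
// Hedged sketch of one "method 5" (Q4_0) block, assuming QK == 32 as in the
// surrounding kernels: a per-block scale plus QK packed 4-bit values.
#include <cstdint>

constexpr int QK = 32;

struct q4_0_block_sketch {
    float    d;          // delta: scale factor shared by the whole block
    uint8_t  qs[QK / 2]; // QK quantized values, two 4-bit nibbles per byte
};
// One block therefore covers 32 weights in 4 + 16 = 20 bytes.
```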
@ -1262,6 +1261,47 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
*s = sumf;
}
#if __AVX512F__ && QK == 32
static inline __m512 dot_q4_0_oneblock_avx512(
__m512 acc,
const uint8_t * pd0,
const uint8_t * pd1,
const uint8_t * pb0,
const uint8_t * pb1,
size_t bs,
int i
) {
const float * d0_0 = (const float *) (pd0 + i*bs);
const float * d1_0 = (const float *) (pd1 + i*bs);
const uint8_t * restrict p0 = pb0 + (i+0)*bs;
const uint8_t * restrict p1 = pb1 + (i+0)*bs;
// Compute combined scale for the block
float scaleScalar = d0_0[0] * d1_0[0];
__m512 scale = _mm512_set1_ps( scaleScalar );
__m256i bx = bytesFromNibbles( p0 );
__m256i by = bytesFromNibbles( p1 );
// Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
const __m256i off = _mm256_set1_epi8( 8 );
bx = _mm256_sub_epi8( bx, off );
by = _mm256_sub_epi8( by, off );
// Sign-extend 16 signed bytes into int16_t
__m512i x32 = _mm512_cvtepi8_epi16( bx );
__m512i y32 = _mm512_cvtepi8_epi16( by );
// Compute products of int16_t integers, add pairwise
__m512i i64 = _mm512_madd_epi16( x32, y32 );
// Convert int32_t to float
__m512 p = _mm512_cvtepi32_ps( i64 );
// Apply the scale, and accumulate
return _mm512_fmadd_ps( scale, p, acc );
}
#endif
inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
ggml_float sumf = 0.0;

@ -1417,6 +1457,40 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
#else
#error "not implemented for QK"
#endif
#elif defined(__AVX512F__)
#if QK == 32
// Initialize accumulator with zeros
__m512 acc0 = _mm512_setzero_ps();
__m512 acc1 = _mm512_setzero_ps();
const int superblock_size = 8;
const int superblock_count = nb / superblock_size;
const int remainder = nb % superblock_size;
for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) {
int i = superblock_ix * superblock_size;
acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+0 );
acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+1 );
acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+2 );
acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+3 );
acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+4 );
acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+5 );
acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+6 );
acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+7 );
}
// Remainders
for (int i = superblock_count * superblock_size; i < nb; ++i) {
acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i );
}
// Horizontal sum of all lanes of the accumulator
sumf = _mm512_reduce_add_ps( acc0 ) + _mm512_reduce_add_ps( acc1 );
#else
#error "not implemented for QK"
#endif
#elif defined(__AVX2__)
#if QK == 32
const size_t countBlocks = nb;

main.cpp (110 changed lines)

@ -20,6 +20,13 @@
#include <signal.h> #include <signal.h>
#endif #endif
#if defined (_WIN32)
#pragma comment(lib,"kernel32.lib")
extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
#endif
#define ANSI_COLOR_RED "\x1b[31m" #define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m" #define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m" #define ANSI_COLOR_YELLOW "\x1b[33m"
@ -90,7 +97,8 @@ struct llama_model {
}; };
// load the model's weights from a file // load the model's weights from a file
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
bool llama_model_load(const std::string & fname, llama_model & model, llama_vocab & vocab, int n_ctx, int n_parts, ggml_type memory_type = GGML_TYPE_F32) {
fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
std::vector<char> f_buf(1024*1024); std::vector<char> f_buf(1024*1024);
@ -106,12 +114,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
{ {
uint32_t magic; uint32_t magic;
fin.read((char *) &magic, sizeof(magic)); fin.read((char *) &magic, sizeof(magic));
if (magic == 0x67676d6c) { if (magic == FILE_MAGIC_UNVERSIONED) {
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n", fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
__func__, fname.c_str()); __func__, fname.c_str());
return false; return false;
} }
if (magic != 0x67676d66) { if (magic != FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
return false; return false;
} }
@ -119,15 +127,14 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
uint32_t format_version; uint32_t format_version;
fin.read((char *) &format_version, sizeof(format_version)); fin.read((char *) &format_version, sizeof(format_version));
if (format_version != 1) { if (format_version != FILE_VERSION) {
fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n", fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
__func__, fname.c_str(), format_version); __func__, fname.c_str(), format_version, FILE_VERSION);
return false; return false;
} }
} }
int n_ff = 0; int n_ff = 0;
int n_parts = 0;
// load hparams // load hparams
{ {
@ -145,7 +152,10 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
hparams.n_ctx = n_ctx; hparams.n_ctx = n_ctx;
n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
if (n_parts < 1) {
n_parts = LLAMA_N_PARTS.at(hparams.n_embd); n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
}
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
@ -162,12 +172,20 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
// load vocab // load vocab
{ {
std::string word; std::string word;
std::vector<char> tmp(64);
for (int i = 0; i < model.hparams.n_vocab; i++) { for (int i = 0; i < model.hparams.n_vocab; i++) {
uint32_t len; uint32_t len;
fin.read((char *) &len, sizeof(len)); fin.read((char *) &len, sizeof(len));
word.resize(len); word.resize(len);
fin.read((char *) word.data(), len); if (len > 0) {
tmp.resize(len);
fin.read(tmp.data(), len);
word.assign(tmp.data(), len);
} else {
word.clear();
}
float score; float score;
fin.read((char *) &score, sizeof(score)); fin.read((char *) &score, sizeof(score));
@ -175,10 +193,6 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
vocab.token_to_id[word] = i; vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word; vocab.id_to_token[i] = word;
vocab.score[i] = score; vocab.score[i] = score;
//if (i < 30000) {
// fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
//}
} }
} }
@ -544,7 +558,7 @@ bool llama_eval(
const llama_model & model, const llama_model & model,
const int n_threads, const int n_threads,
const int n_past, const int n_past,
const std::vector<gpt_vocab::id> & embd_inp, const std::vector<llama_vocab::id> & embd_inp,
std::vector<float> & embd_w, std::vector<float> & embd_w,
size_t & mem_per_token, size_t & mem_per_token,
bool return_all_logits = false) { bool return_all_logits = false) {
@ -786,11 +800,11 @@ std::vector<double> softmax(const std::vector<float>& logits) {
return probs; return probs;
} }
void perplexity(const gpt_vocab &vocab, const llama_model &model, const gpt_params &params, size_t mem_per_token) { void perplexity(const llama_vocab &vocab, const llama_model &model, const gpt_params &params, size_t mem_per_token) {
// Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
// Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw` // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
// Output: `perplexity: 13.5106 [114/114]` // Output: `perplexity: 13.5106 [114/114]`
std::vector<gpt_vocab::id> tokens = ::llama_tokenize(vocab, params.prompt, true); std::vector<llama_vocab::id> tokens = ::llama_tokenize(vocab, params.prompt, true);
int count = 0; int count = 0;
double nll = 0.0; double nll = 0.0;
@ -799,7 +813,7 @@ void perplexity(const gpt_vocab &vocab, const llama_model &model, const gpt_para
for (int i = 0; i < seq_count; ++i) { for (int i = 0; i < seq_count; ++i) {
int start = i * params.n_ctx; int start = i * params.n_ctx;
int end = start + params.n_ctx - 1; int end = start + params.n_ctx - 1;
std::vector<gpt_vocab::id> embd(tokens.begin() + start, tokens.begin() + end); std::vector<llama_vocab::id> embd(tokens.begin() + start, tokens.begin() + end);
std::vector<float> logits; std::vector<float> logits;
auto start_t = std::chrono::high_resolution_clock::now(); auto start_t = std::chrono::high_resolution_clock::now();
if (!llama_eval(model, params.n_threads, 0, embd, logits, mem_per_token, true)) { if (!llama_eval(model, params.n_threads, 0, embd, logits, mem_per_token, true)) {
@ -908,14 +922,14 @@ int main(int argc, char ** argv) {
int64_t t_load_us = 0; int64_t t_load_us = 0;
gpt_vocab vocab; llama_vocab vocab;
llama_model model; llama_model model;
// load the model // load the model
{ {
const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; const ggml_type memory_type = params.memory_f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_time_us();
if (!llama_model_load(params.model, model, vocab, params.n_ctx, memory_type)) { if (!llama_model_load(params.model, model, vocab, params.n_ctx, params.n_parts, memory_type)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1; return 1;
} }
@ -949,13 +963,13 @@ int main(int argc, char ** argv) {
// Add a space in front of the first character to match OG llama tokenizer behavior // Add a space in front of the first character to match OG llama tokenizer behavior
params.prompt.insert(0, 1, ' '); params.prompt.insert(0, 1, ' ');
// tokenize the prompt // tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true); std::vector<llama_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
// prefix & suffix for instruct mode // prefix & suffix for instruct mode
const std::vector<gpt_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true); const std::vector<llama_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
const std::vector<gpt_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false); const std::vector<llama_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
// in instruct mode, we inject a prefix and a suffix to each input by the user // in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) { if (params.instruct) {
@ -963,15 +977,8 @@ int main(int argc, char ** argv) {
params.antiprompt.push_back("### Instruction:\n\n"); params.antiprompt.push_back("### Instruction:\n\n");
} }
// tokenize the reverse prompt
std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;
for (auto antiprompt : params.antiprompt) {
antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
}
// enable interactive mode if reverse prompt is specified // enable interactive mode if reverse prompt is specified
if (antipromptv_inp.size() != 0) { if (params.antiprompt.size() != 0) {
params.interactive = true; params.interactive = true;
} }
@ -995,25 +1002,19 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: interactive mode on.\n", __func__); fprintf(stderr, "%s: interactive mode on.\n", __func__);
if(antipromptv_inp.size()) { if(params.antiprompt.size()) {
for (size_t apindex = 0; apindex < antipromptv_inp.size(); ++apindex) { for (auto antiprompt : params.antiprompt) {
auto antiprompt_inp = antipromptv_inp.at(apindex); fprintf(stderr, "Reverse prompt: '%s'\n", antiprompt.c_str());
fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.at(apindex).c_str());
fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
}
fprintf(stderr, "\n");
} }
} }
} }
fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
fprintf(stderr, "\n\n"); fprintf(stderr, "\n\n");
std::vector<gpt_vocab::id> embd; std::vector<llama_vocab::id> embd;
int last_n_size = params.repeat_last_n; int last_n_size = params.repeat_last_n;
std::vector<gpt_vocab::id> last_n_tokens(last_n_size); std::vector<llama_vocab::id> last_n_tokens(last_n_size);
std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0); std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
if (params.interactive) { if (params.interactive) {
@ -1033,6 +1034,14 @@ int main(int argc, char ** argv) {
// set the color for the prompt which will be output initially // set the color for the prompt which will be output initially
if (params.use_color) { if (params.use_color) {
#if defined (_WIN32)
// Enable ANSI colors on Windows 10+
unsigned long dwMode = 0;
void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
}
#endif
printf(ANSI_COLOR_YELLOW); printf(ANSI_COLOR_YELLOW);
} }
@ -1052,7 +1061,7 @@ int main(int argc, char ** argv) {
n_past += embd.size(); n_past += embd.size();
embd.clear(); embd.clear();
if (embd_inp.size() <= input_consumed) { if ((int) embd_inp.size() <= input_consumed) {
// out of user input, sample next token // out of user input, sample next token
const float top_k = params.top_k; const float top_k = params.top_k;
const float top_p = params.top_p; const float top_p = params.top_p;
@ -1061,7 +1070,7 @@ int main(int argc, char ** argv) {
const int n_vocab = model.hparams.n_vocab; const int n_vocab = model.hparams.n_vocab;
gpt_vocab::id id = 0; llama_vocab::id id = 0;
{ {
const int64_t t_start_sample_us = ggml_time_us(); const int64_t t_start_sample_us = ggml_time_us();
@ -1089,7 +1098,7 @@ int main(int argc, char ** argv) {
--remaining_tokens; --remaining_tokens;
} else { } else {
// some user input remains from prompt or interaction, forward it to processing // some user input remains from prompt or interaction, forward it to processing
while (embd_inp.size() > input_consumed) { while ((int) embd_inp.size() > input_consumed) {
embd.push_back(embd_inp[input_consumed]); embd.push_back(embd_inp[input_consumed]);
last_n_tokens.erase(last_n_tokens.begin()); last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[input_consumed]); last_n_tokens.push_back(embd_inp[input_consumed]);
@ -1114,11 +1123,16 @@ int main(int argc, char ** argv) {
// in interactive mode, and not currently processing queued inputs; // in interactive mode, and not currently processing queued inputs;
// check if we should prompt the user for more // check if we should prompt the user for more
if (params.interactive && embd_inp.size() <= input_consumed) { if (params.interactive && (int) embd_inp.size() <= input_consumed) {
// check for reverse prompt // check for reverse prompt
for (auto antiprompt_inp : antipromptv_inp) { std::string last_output;
if (antiprompt_inp.size() && std::equal(antiprompt_inp.rbegin(), antiprompt_inp.rend(), last_n_tokens.rbegin())) { for (auto id : last_n_tokens) {
// reverse prompt found last_output += vocab.id_to_token[id];
}
// Check if each of the reverse prompts appears at the end of the output.
for (std::string antiprompt : params.antiprompt) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
is_interacting = true; is_interacting = true;
break; break;
} }
@ -1147,7 +1161,7 @@ int main(int argc, char ** argv) {
} while (another_line); } while (another_line);
if (params.use_color) printf(ANSI_COLOR_RESET); if (params.use_color) printf(ANSI_COLOR_RESET);
std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false); std::vector<llama_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
if (params.instruct) { if (params.instruct) {

models/ggml-vocab.bin (new binary file, contents not shown)


@ -44,7 +44,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
return false;
}
-gpt_vocab vocab;
+llama_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

@ -64,12 +64,12 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
-if (magic == 0x67676d6c) {
+if (magic == FILE_MAGIC_UNVERSIONED) {
fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
__func__, fname_inp.c_str());
return false;
}
-if (magic != 0x67676d66) {
+if (magic != FILE_MAGIC) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}

@ -79,9 +79,9 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
uint32_t format_version;
finp.read((char *) &format_version, sizeof(format_version));
-if (format_version != 1) {
+if (format_version != FILE_VERSION) {
-fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
+fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ", expected %d)\n",
-__func__, fname_inp.c_str(), format_version);
+__func__, fname_inp.c_str(), format_version, FILE_VERSION);
return false;
}

tests/CMakeLists.txt (new file, 4 lines)

@ -0,0 +1,4 @@
set(TEST_TARGET test-tokenizer-0)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE utils)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)


@ -0,0 +1,69 @@
#include "utils.h"
#include <cstdio>
#include <string>
#include <map>
static const std::map<std::string, std::vector<llama_vocab::id>> k_tests = {
{ "Hello World", { 1, 10994, 2787, }, },
{ " Hello World", { 1, 15043, 2787, }, },
{ " Hello World!", { 1, 15043, 2787, 29991, }, },
{ " this is 🦙.cpp", { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
{ "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
{ "нещо на Български", { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
};
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_vocab vocab;
if (!llama_vocab_load(fname, vocab)) {
fprintf(stderr, "%s : failed to load vocab from: '%s'\n", __func__, fname.c_str());
return 1;
}
const int n_vocab = vocab.id_to_token.size();
if (n_vocab != 32000) {
fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
return 2;
}
for (const auto & test_kv : k_tests) {
const auto res = llama_tokenize(vocab, test_kv.first, true);
bool correct = res.size() == test_kv.second.size();
for (int i = 0; i < (int) res.size() && correct; ++i) {
if (res[i] != test_kv.second[i]) {
correct = false;
}
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d, ", t);
}
fprintf(stderr, "\n");
return 3;
}
}
return 0;
}
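The new test above also doubles as a usage example for the two helpers it pulls in from `utils`. A minimal sketch of the same flow, assuming the build from this commit (the vocab path and the expected ids are taken from the test itself):

```cpp
#include "utils.h"

#include <cstdio>
#include <vector>

int main() {
    llama_vocab vocab;

    // models/ggml-vocab.bin is the vocab-only file added in this commit
    if (!llama_vocab_load("models/ggml-vocab.bin", vocab)) {
        fprintf(stderr, "failed to load vocab\n");
        return 1;
    }

    // per k_tests above, " Hello World" tokenized with a BOS token
    // is expected to yield { 1, 15043, 2787 }
    const std::vector<llama_vocab::id> ids = llama_tokenize(vocab, " Hello World", true);

    for (const auto id : ids) {
        printf("%6d -> '%s'\n", id, vocab.id_to_token.at(id).c_str());
    }

    return 0;
}
```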

utils.cpp (179 changed lines)

@ -12,7 +12,7 @@
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW #include <malloc.h> // using malloc.h with MSC/MINGW
#elif !defined(__FreeBSD__) && !defined(__NetBSD__) #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
#include <alloca.h> #include <alloca.h>
#endif #endif
@ -76,6 +76,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.perplexity = true; params.perplexity = true;
} else if (arg == "--ignore-eos") { } else if (arg == "--ignore-eos") {
params.ignore_eos = true; params.ignore_eos = true;
} else if (arg == "--n_parts") {
params.n_parts = std::stoi(argv[++i]);
} else if (arg == "-h" || arg == "--help") { } else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, params); gpt_print_usage(argc, argv, params);
exit(0); exit(0);
@ -118,6 +120,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n"); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n");
fprintf(stderr, " --memory_f16 use f16 instead of f32 for memory key+value\n"); fprintf(stderr, " --memory_f16 use f16 instead of f32 for memory key+value\n");
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " -m FNAME, --model FNAME\n");
@ -243,61 +246,6 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
return result; return result;
} }
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.size() == 0) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
static size_t utf8_len(char src) { static size_t utf8_len(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(src) >> 4; uint8_t highbits = static_cast<uint8_t>(src) >> 4;
@ -308,7 +256,8 @@ struct llama_sp_symbol {
using index = int; using index = int;
index prev; index prev;
index next; index next;
std::string_view text; const char * text;
size_t n;
}; };
struct llama_sp_bigram { struct llama_sp_bigram {
@ -325,19 +274,23 @@ struct llama_sp_bigram {
size_t size; size_t size;
}; };
// original implementation:
// https://github.com/ggerganov/llama.cpp/commit/074bea2eb1f1349a0118239c4152914aecaa1be4
struct llama_tokenizer { struct llama_tokenizer {
llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {} llama_tokenizer(const llama_vocab & vocab): vocab_(vocab) {}
void tokenize(std::string_view text, std::vector<gpt_vocab::id> & output) { void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
// split string into utf8 chars // split string into utf8 chars
int index = 0; int index = 0;
while (!text.empty()) { size_t offs = 0;
while (offs < text.size()) {
llama_sp_symbol sym; llama_sp_symbol sym;
size_t char_len = std::min(text.size(), utf8_len(text.data()[0])); size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
sym.text = std::string_view(text.data(), char_len); sym.text = text.c_str() + offs;
sym.n = char_len;
offs += char_len;
sym.prev = index - 1; sym.prev = index - 1;
text.remove_prefix(char_len); sym.next = offs == text.size() ? -1 : index + 1;
sym.next = text.empty() ? -1 : index + 1;
index++; index++;
symbols_.emplace_back(std::move(sym)); symbols_.emplace_back(std::move(sym));
} }
@ -356,14 +309,16 @@ struct llama_tokenizer {
auto & right_sym = symbols_[bigram.right]; auto & right_sym = symbols_[bigram.right];
// if one of the symbols already got merged, skip it. // if one of the symbols already got merged, skip it.
if (left_sym.text.empty() || right_sym.text.empty() || if (left_sym.n == 0 || right_sym.n == 0 ||
left_sym.text.size() + right_sym.text.size() != bigram.size) { left_sym.n + right_sym.n != bigram.size) {
continue; continue;
} }
// merge the right sym into the left one // merge the right sym into the left one
left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size()); left_sym.n += right_sym.n;
right_sym.text = std::string_view(""); right_sym.n = 0;
//printf("left = '%*s' size = %zu\n", (int) left_sym.n, left_sym.text, bigram.size);
// remove the right sym from the chain // remove the right sym from the chain
left_sym.next = right_sym.next; left_sym.next = right_sym.next;
@ -377,13 +332,13 @@ struct llama_tokenizer {
} }
for (int i = 0; i != -1; i = symbols_[i].next) { for (int i = 0; i != -1; i = symbols_[i].next) {
auto& symbol = symbols_[i]; auto & symbol = symbols_[i];
auto token = vocab_.token_to_id.find(std::string(symbol.text)); auto token = vocab_.token_to_id.find(std::string(symbol.text, symbol.n));
if (token == vocab_.token_to_id.end()) { if (token == vocab_.token_to_id.end()) {
// output any symbols that did not form tokens as bytes. // output any symbols that did not form tokens as bytes.
for (int j = 0; j < symbol.text.size(); ++j) { for (int j = 0; j < (int) symbol.n; ++j) {
gpt_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3; llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
output.push_back(token_id); output.push_back(token_id);
} }
} else { } else {
@ -398,8 +353,8 @@ private:
return; return;
} }
std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size()); const std::string text = std::string(symbols_[left].text, symbols_[left].n + symbols_[right].n);
auto token = vocab_.token_to_id.find(std::string(text)); auto token = vocab_.token_to_id.find(text);
if (token == vocab_.token_to_id.end()) { if (token == vocab_.token_to_id.end()) {
return; return;
@ -419,14 +374,52 @@ private:
work_queue_.push(bigram); work_queue_.push(bigram);
} }
const gpt_vocab & vocab_; const llama_vocab & vocab_;
std::vector<llama_sp_symbol> symbols_; std::vector<llama_sp_symbol> symbols_;
llama_sp_bigram::queue work_queue_; llama_sp_bigram::queue work_queue_;
}; };
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) { // TODO: temporary code duplication with llama.cpp
// will resolve after #77 is merged
bool llama_vocab_load(const std::string & fname, llama_vocab & vocab) {
std::ifstream fin(fname, std::ios::binary);
if (!fin.is_open()) {
return false;
}
int n_vocab = 0;
fin.read((char *) &n_vocab, sizeof(n_vocab));
std::string word;
std::vector<char> tmp(64);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
fin.read((char *) &len, sizeof(len));
word.resize(len);
if (len > 0) {
tmp.resize(len);
fin.read(tmp.data(), len);
word.assign(tmp.data(), len);
} else {
word.clear();
}
float score;
fin.read((char *) &score, sizeof(score));
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
vocab.score[i] = score;
}
return true;
}
std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos) {
llama_tokenizer tokenizer(vocab); llama_tokenizer tokenizer(vocab);
std::vector<gpt_vocab::id> output; std::vector<llama_vocab::id> output;
if (text.size() == 0) { if (text.size() == 0) {
return output; return output;
@ -440,42 +433,22 @@ std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_v
return output; return output;
} }
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k) {
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
vocab.token_to_id = ::json_parse(fname);
for (const auto & kv : vocab.token_to_id) {
vocab.id_to_token[kv.second] = kv.first;
}
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
// print the vocabulary
//for (auto kv : vocab.token_to_id) {
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
//}
return true;
}
void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k) {
// find the top K tokens // find the top K tokens
std::partial_sort( std::partial_sort(
logits_id.begin(), logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(), logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) { [](const std::pair<double, llama_vocab::id> & a, const std::pair<double, llama_vocab::id> & b) {
return a.first > b.first; return a.first > b.first;
}); });
logits_id.resize(top_k); logits_id.resize(top_k);
} }
gpt_vocab::id llama_sample_top_p_top_k( llama_vocab::id llama_sample_top_p_top_k(
const gpt_vocab & vocab, const llama_vocab & vocab,
const float * logits, const float * logits,
std::vector<gpt_vocab::id> & last_n_tokens, std::vector<llama_vocab::id> & last_n_tokens,
double repeat_penalty, double repeat_penalty,
int top_k, int top_k,
double top_p, double top_p,
@ -483,7 +456,7 @@ gpt_vocab::id llama_sample_top_p_top_k(
std::mt19937 & rng) { std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size(); int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, gpt_vocab::id>> logits_id; std::vector<std::pair<double, llama_vocab::id>> logits_id;
logits_id.reserve(n_logits); logits_id.reserve(n_logits);
{ {

utils.h (46 changed lines)

@ -17,27 +17,27 @@ struct gpt_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 128; // new tokens to predict int32_t n_predict = 128; // new tokens to predict
int32_t repeat_last_n = 64; // last n tokens to penalize int32_t repeat_last_n = 64; // last n tokens to penalize
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
int32_t n_ctx = 512; //context size int32_t n_ctx = 512; //context size
bool memory_f16 = false; // use f16 instead of f32 for memory kv
// sampling parameters // sampling parameters
int32_t top_k = 40; int32_t top_k = 40;
float top_p = 0.95f; float top_p = 0.95f;
float temp = 0.80f; float temp = 0.80f;
float repeat_penalty = 1.30f; float repeat_penalty = 1.10f;
int32_t n_batch = 8; // batch size for prompt processing int32_t n_batch = 8; // batch size for prompt processing
std::string model = "models/lamma-7B/ggml-model.bin"; // model path std::string model = "models/lamma-7B/ggml-model.bin"; // model path
std::string prompt = ""; std::string prompt = "";
bool random_prompt = false; std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
bool memory_f16 = false; // use f16 instead of f32 for memory kv
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode bool interactive = false; // interactive mode
bool interactive_start = false; // reverse prompt immediately bool interactive_start = false; // reverse prompt immediately
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
bool instruct = false; // instruction mode (used for Alpaca models) bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos bool ignore_eos = false; // do not stop generating after eos
bool perplexity = false; // compute perplexity over the prompt bool perplexity = false; // compute perplexity over the prompt
@ -49,11 +49,19 @@ void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng); std::string gpt_random_prompt(std::mt19937 & rng);
//
// Model file parsing
//
#define FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#define FILE_MAGIC 0x67676d66 // 'ggmf' in hex
#define FILE_VERSION 1
// //
// Vocab utils // Vocab utils
// //
struct gpt_vocab { struct llama_vocab {
using id = int32_t; using id = int32_t;
using token = std::string; using token = std::string;
@ -67,34 +75,22 @@ void replace(std::string & str, const std::string & needle, const std::string &
// poor-man's JSON parsing // poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname); std::map<std::string, int32_t> json_parse(const std::string & fname);
// split text into tokens // TODO: temporary until #77 is merged, need this now for some tokenizer tests
// bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
// TODO: this is probably wrong, but I cannot figure out how this tokenizer works .. // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
// ref: https://github.com/google/sentencepiece // ref: https://github.com/google/sentencepiece
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos); std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, const std::string & text, bool bos);
// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
// sample next token given probabilities for each embedding // sample next token given probabilities for each embedding
// //
// - consider only the top K tokens // - consider only the top K tokens
// - from them, consider only the top tokens with cumulative probability > P // - from them, consider only the top tokens with cumulative probability > P
// //
gpt_vocab::id llama_sample_top_p_top_k( llama_vocab::id llama_sample_top_p_top_k(
const gpt_vocab & vocab, const llama_vocab & vocab,
const float * logits, const float * logits,
std::vector<gpt_vocab::id> & last_n_tokens, std::vector<llama_vocab::id> & last_n_tokens,
double repeat_penalty, double repeat_penalty,
int top_k, int top_k,
double top_p, double top_p,
@ -102,7 +98,7 @@ gpt_vocab::id llama_sample_top_p_top_k(
std::mt19937 & rng); std::mt19937 & rng);
// filer to top K tokens from list of logits // filer to top K tokens from list of logits
void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k); void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k);
// //
// Quantization // Quantization