From bc054af97ac68a4b726e972cb283eb9565253ed5 Mon Sep 17 00:00:00 2001
From: Cebtenzzre <cebtenzzre@gmail.com>
Date: Sun, 3 Sep 2023 01:26:59 -0400
Subject: [PATCH 1/5] make : support overriding
 CFLAGS/CXXFLAGS/CPPFLAGS/LDFLAGS (#2886)

* make : remove unused -DGGML_BIG_ENDIAN

* make : put preprocessor stuff in CPPFLAGS

* make : pass Raspberry Pi arch flags to g++ as well

* make : support overriding CFLAGS/CXXFLAGS/CPPFLAGS/LDFLAGS

* make : fix inverted conditional
---
 Makefile | 150 ++++++++++++++++++++++++-------------------------------
 1 file changed, 66 insertions(+), 84 deletions(-)

diff --git a/Makefile b/Makefile
index 23f050c0d..e214970f8 100644
--- a/Makefile
+++ b/Makefile
@@ -67,21 +67,21 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
-CFLAGS   = -I.            $(OPT) -std=c11   -fPIC
-CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
-LDFLAGS  =
+MK_CPPFLAGS = -I. -Icommon
+MK_CFLAGS   = $(CPPFLAGS) $(OPT) -std=c11   -fPIC
+MK_CXXFLAGS = $(CPPFLAGS) $(OPT) -std=c++11 -fPIC
+MK_LDFLAGS  =
 
 ifdef LLAMA_DEBUG
-	CFLAGS   += -O0 -g
-	CXXFLAGS += -O0 -g
-	LDFLAGS  += -g
+	MK_CFLAGS   += -O0 -g
+	MK_CXXFLAGS += -O0 -g
+	MK_LDFLAGS  += -g
 else
-	CFLAGS   += -DNDEBUG
-	CXXFLAGS += -DNDEBUG
+	MK_CPPFLAGS += -DNDEBUG
 endif
 
 ifdef LLAMA_SERVER_VERBOSE
-	CXXFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
+	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif
 
 ifdef LLAMA_DISABLE_LOGS
@@ -90,9 +90,9 @@ ifdef LLAMA_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS
 
 # warnings
-CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
-			-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
-CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
+MK_CFLAGS    += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
+				-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
+MK_CXXFLAGS  += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
 
 ifeq '' '$(findstring clang++,$(CXX))'
 	# g++ only
@@ -101,29 +101,9 @@ endif
 
 # OS specific
 # TODO: support Windows
-ifeq ($(UNAME_S),Linux)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),Darwin)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),FreeBSD)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),NetBSD)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),OpenBSD)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif
-ifeq ($(UNAME_S),Haiku)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
+ifneq '' '$(filter $(UNAME_S),Linux Darwin FreeBSD NetBSD OpenBSD Haiku)'
+	MK_CFLAGS   += -pthread
+	MK_CXXFLAGS += -pthread
 endif
 
 # detect Windows
@@ -149,12 +129,11 @@ ifeq ($(_WIN32),1)
 endif
 
 ifdef LLAMA_GPROF
-	CFLAGS   += -pg
-	CXXFLAGS += -pg
+	MK_CFLAGS   += -pg
+	MK_CXXFLAGS += -pg
 endif
 ifdef LLAMA_PERF
-	CFLAGS   += -DGGML_PERF
-	CXXFLAGS += -DGGML_PERF
+	MK_CPPFLAGS += -DGGML_PERF
 endif
 
 # Architecture specific
@@ -165,16 +144,16 @@ ifndef RISCV
 
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 	# Use all CPU extensions that are available:
-	CFLAGS   += -march=native -mtune=native
-	CXXFLAGS += -march=native -mtune=native
+	MK_CFLAGS   += -march=native -mtune=native
+	MK_CXXFLAGS += -march=native -mtune=native
 
 	# Usage AVX-only
-	#CFLAGS   += -mfma -mf16c -mavx
-	#CXXFLAGS += -mfma -mf16c -mavx
+	#MK_CFLAGS   += -mfma -mf16c -mavx
+	#MK_CXXFLAGS += -mfma -mf16c -mavx
 
 	# Usage SSSE3-only (Not is SSE3!)
-	#CFLAGS   += -mssse3
-	#CXXFLAGS += -mssse3
+	#MK_CFLAGS   += -mssse3
+	#MK_CXXFLAGS += -mssse3
 endif
 
 # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
@@ -188,34 +167,33 @@ endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	# Apple M1, M2, etc.
 	# Raspberry Pi 3, 4, Zero 2 (64-bit)
-	CFLAGS   += -mcpu=native
-	CXXFLAGS += -mcpu=native
+	MK_CFLAGS   += -mcpu=native
+	MK_CXXFLAGS += -mcpu=native
 endif
 
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, Zero
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+	MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+	MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif
 
 ifneq ($(filter armv7%,$(UNAME_M)),)
 	# Raspberry Pi 2
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+	MK_CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+	MK_CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif
 
 ifneq ($(filter armv8%,$(UNAME_M)),)
 	# Raspberry Pi 3, 4, Zero 2 (32-bit)
-	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+	MK_CFLAGS   += -mfp16-format=ieee -mno-unaligned-access
+	MK_CXXFLAGS += -mfp16-format=ieee -mno-unaligned-access
 endif
 
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
-		CFLAGS   += -mcpu=power9
-		CXXFLAGS += -mcpu=power9
-	endif
-	# Require c++23's std::byteswap for big-endian support.
-	ifeq ($(UNAME_M),ppc64)
-		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
+		MK_CFLAGS   += -mcpu=power9
+		MK_CXXFLAGS += -mcpu=power9
 	endif
 endif
 
@@ -225,12 +203,10 @@ else
 endif
 
 ifndef LLAMA_NO_K_QUANTS
-	CFLAGS   += -DGGML_USE_K_QUANTS
-	CXXFLAGS += -DGGML_USE_K_QUANTS
+	MK_CPPFLAGS += -DGGML_USE_K_QUANTS
 	OBJS     += k_quants.o
 ifdef LLAMA_QKK_64
-	CFLAGS   += -DGGML_QKK_64
-	CXXFLAGS += -DGGML_QKK_64
+	MK_CPPFLAGS += -DGGML_QKK_64
 endif
 endif
 
@@ -238,31 +214,32 @@ ifndef LLAMA_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework.
 	# `-framework Accelerate` works on Mac Intel as well, with negliable performance boost (as of the predict time).
 	ifeq ($(UNAME_S),Darwin)
-		CFLAGS  += -DGGML_USE_ACCELERATE
-		LDFLAGS += -framework Accelerate
+		MK_CPPFLAGS += -DGGML_USE_ACCELERATE
+		MK_LDFLAGS  += -framework Accelerate
 	endif
 endif # LLAMA_NO_ACCELERATE
 
 ifdef LLAMA_MPI
-	CFLAGS += -DGGML_USE_MPI -Wno-cast-qual
-	CXXFLAGS += -DGGML_USE_MPI -Wno-cast-qual
+	MK_CPPFLAGS += -DGGML_USE_MPI
+	MK_CFLAGS   += -Wno-cast-qual
+	MK_CXXFLAGS += -Wno-cast-qual
 	OBJS     += ggml-mpi.o
 endif # LLAMA_MPI
 
 ifdef LLAMA_OPENBLAS
-	CFLAGS  += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags openblas)
-	LDFLAGS += $(shell pkg-config --libs openblas)
+	MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
+	MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
+	MK_LDFLAGS  += $(shell pkg-config --libs openblas)
 endif # LLAMA_OPENBLAS
 
 ifdef LLAMA_BLIS
-	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
-	LDFLAGS += -lblis -L/usr/local/lib
+	MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis
+	MK_LDFLAGS  += -lblis -L/usr/local/lib
 endif # LLAMA_BLIS
 
 ifdef LLAMA_CUBLAS
-	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-	CXXFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
-	LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	MK_LDFLAGS   += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS      += ggml-cuda.o
 	NVCCFLAGS = --forward-unknown-to-host-compiler -use_fast_math
 ifdef LLAMA_CUDA_NVCC
@@ -313,14 +290,15 @@ endif # LLAMA_CUBLAS
 
 ifdef LLAMA_CLBLAST
 
-	CFLAGS   += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
-	CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL)
+	MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
+	MK_CFLAGS   += $(shell pkg-config --cflags-only-other clblast OpenCL)
+	MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
 
 	# Mac provides OpenCL as a framework
 	ifeq ($(UNAME_S),Darwin)
-		LDFLAGS += -lclblast -framework OpenCL
+		MK_LDFLAGS += -lclblast -framework OpenCL
 	else
-		LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
+		MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
 	endif
 	OBJS    += ggml-opencl.o
 
@@ -335,10 +313,9 @@ ifdef LLAMA_HIPBLAS
 	LLAMA_CUDA_DMMV_X       ?= 32
 	LLAMA_CUDA_MMV_Y        ?= 1
 	LLAMA_CUDA_KQUANTS_ITER ?= 2
-	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-	LDFLAGS		+= -lhipblas -lamdhip64 -lrocblas
+	MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
+	MK_LDFLAGS  += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+	MK_LDFLAGS	+= -lhipblas -lamdhip64 -lrocblas
 	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
 	HIPFLAGS    += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 	HIPFLAGS    += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
@@ -353,10 +330,9 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif # LLAMA_HIPBLAS
 
 ifdef LLAMA_METAL
-	CFLAGS   += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
-	CXXFLAGS += -DGGML_USE_METAL
-	LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
-	OBJS     += ggml-metal.o
+	MK_CPPFLAGS += -DGGML_USE_METAL #-DGGML_METAL_NDEBUG
+	MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
+	OBJS		+= ggml-metal.o
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
@@ -369,11 +345,17 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_MPI
 
-ifdef LLAMA_NO_K_QUANTS
+ifndef LLAMA_NO_K_QUANTS
 k_quants.o: k_quants.c k_quants.h
 	$(CC) $(CFLAGS) -c $< -o $@
 endif # LLAMA_NO_K_QUANTS
 
+# combine build flags with cmdline overrides; user-supplied flags are placed after the Makefile's own
+override CPPFLAGS := $(MK_CPPFLAGS) $(CPPFLAGS)
+override CFLAGS   := $(MK_CFLAGS) $(CFLAGS)
+override CXXFLAGS := $(MK_CXXFLAGS) $(CXXFLAGS)
+override LDFLAGS  := $(MK_LDFLAGS) $(LDFLAGS)
+
 #
 # Print build information
 #

From 2753415afdaf22a18c49608bd9d93cfffc05d435 Mon Sep 17 00:00:00 2001
From: kchro3 <62481661+kchro3@users.noreply.github.com>
Date: Sat, 2 Sep 2023 22:27:25 -0700
Subject: [PATCH 2/5] swift : add missing c file to Package.swift (#2978)

---
 Package.swift | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Package.swift b/Package.swift
index 73d027c70..2c07710cd 100644
--- a/Package.swift
+++ b/Package.swift
@@ -12,7 +12,7 @@ let package = Package(
             name: "llama",
             path: ".",
             exclude: ["ggml-metal.metal"],
-            sources: ["ggml.c", "llama.cpp"],
+            sources: ["ggml.c", "llama.cpp", "ggml-alloc.c"],
             publicHeadersPath: "spm-headers",
             cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"]), .define("GGML_USE_ACCELERATE")],
             linkerSettings: [

From c42f0ec6b344e14bd81c8612ab1445b3ff77358b Mon Sep 17 00:00:00 2001
From: momonga <115213907+mmnga@users.noreply.github.com>
Date: Sun, 3 Sep 2023 14:36:28 +0900
Subject: [PATCH 3/5] examples : fix gpt-neox (#2943)

Co-authored-by: mmnga <mmnga1mmnga@gmail.com>
---
 examples/gptneox-wip/gptneox-main.cpp | 13 ++++----
 llama.cpp                             | 46 +++++++++++++++++++++++++--
 2 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/examples/gptneox-wip/gptneox-main.cpp b/examples/gptneox-wip/gptneox-main.cpp
index 04af50245..6291523f2 100644
--- a/examples/gptneox-wip/gptneox-main.cpp
+++ b/examples/gptneox-wip/gptneox-main.cpp
@@ -660,9 +660,10 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt2
 ggml_tensor * gpt_neox_ff(
         const gpt_neox_block &block,
         ggml_context * ctx0,
-        ggml_tensor * inp) {
+        ggml_tensor * inp,
+        const gpt_neox_hparams &hparams) {
 
-    ggml_tensor * cur = ggml_norm(ctx0, inp);
+    ggml_tensor * cur = ggml_norm(ctx0, inp, hparams.norm_eps);
 
     cur = ggml_add(ctx0, ggml_mul(ctx0, ggml_repeat(ctx0, block.ln_2_g, cur), cur), ggml_repeat(ctx0, block.ln_2_b, cur));
     cur = ggml_mul_mat(ctx0, block.c_mlp_fc_w, cur);
@@ -753,7 +754,7 @@ bool gpt_neox_eval(
         // self-attention
         {
             {
-                cur = ggml_norm(ctx0, inpL);
+                cur = ggml_norm(ctx0, inpL, hparams.norm_eps);
 
                 cur = ggml_add(ctx0,
                         ggml_mul(ctx0, ggml_repeat(ctx0, model.blocks[il].ln_1_g, cur), cur),
@@ -844,7 +845,7 @@ bool gpt_neox_eval(
         if (hparams.par_res == 0) {
             struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
 
-            cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF);
+            cur = gpt_neox_ff(model.blocks[il], ctx0, inpFF, hparams);
 
             // input for next layer
             inpL = ggml_add(ctx0, cur, inpFF);
@@ -853,7 +854,7 @@ bool gpt_neox_eval(
 
             // this is independent of the self-attention result, so it could be done in parallel to the self-attention
             // note here we pass inpL instead of cur
-            cur = gpt_neox_ff(model.blocks[il], ctx0, inpL);
+            cur = gpt_neox_ff(model.blocks[il], ctx0, inpL, hparams);
 
             // layer input + FF
             cur  = ggml_add(ctx0, cur, inpFF);
@@ -867,7 +868,7 @@ bool gpt_neox_eval(
 
     // norm
     {
-        inpL = ggml_norm(ctx0, inpL);
+        inpL = ggml_norm(ctx0, inpL, hparams.norm_eps);
 
         // inpL = ln_f_g*inpL + ln_f_b
         inpL = ggml_add(ctx0,
diff --git a/llama.cpp b/llama.cpp
index 3114d3311..2b0cf30f6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -325,6 +325,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -1605,9 +1643,13 @@ static void llm_load_hparams(
 
         GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
-        if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
-            throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+            }
         }
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
     }
 
     // arch-specific KVs

From 340af42f09a80e32f4998857b4f0543e41124525 Mon Sep 17 00:00:00 2001
From: Ido S <ido.pluto@gmail.com>
Date: Sun, 3 Sep 2023 08:50:51 +0300
Subject: [PATCH 4/5] docs : add `catai` to `README.md` (#2967)

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 5eda5f006..0cfd94db4 100644
--- a/README.md
+++ b/README.md
@@ -120,6 +120,7 @@ as the main playground for developing new features for the [ggml](https://github
 
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [oobabooga/text-generation-webui](https://github.com/oobabooga/text-generation-webui)
+- [withcatai/catai](https://github.com/withcatai/catai)
 
 ---
 

From cff7b0bf07cb46e1ad4fd199f6bdeb538925c8c4 Mon Sep 17 00:00:00 2001
From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
Date: Sat, 2 Sep 2023 23:52:13 -0600
Subject: [PATCH 5/5] convert.py : BPE fixes (#2938)

* convert.py: BPE fixes (fall back to tokenizer.json for added tokens, assign token types, use a constant score)

* Remove unnecessary conditional in additional-token error handling
---
 convert.py | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/convert.py b/convert.py
index 6c89b5ecc..5a7483b43 100755
--- a/convert.py
+++ b/convert.py
@@ -323,15 +323,27 @@ class BpeVocab:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
+            # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
             added_tokens = json.load(open(fname_added_tokens, encoding="utf-8"))
         else:
-            added_tokens = {}
+            # Fall back to trying to find the added tokens in tokenizer.json
+            tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json'
+            if not tokenizer_json_file.is_file():
+                added_tokens = {}
+            else:
+                tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8"))
+                added_tokens = dict(
+                    (item['content'], item['id'])
+                    for item in tokenizer_json.get('added_tokens', [])
+                    # tokenizer.json may list main-vocabulary entries under added_tokens too; skip those duplicates.
+                    if item['content'] not in self.bpe_tokenizer )
 
         vocab_size: int = len(self.bpe_tokenizer)
         expected_ids    = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids      = sorted(added_tokens.values())
         if expected_ids != actual_ids:
-            raise Exception(f"Expected added token IDs to be sequential and start at {len(added_tokens)}; got {actual_ids}")
+            expected_end_id = vocab_size + len(actual_ids) - 1
+            raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}")
 
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
         self.added_tokens_list    = [text for (text, idx) in items]
@@ -345,10 +357,22 @@ class BpeVocab:
         from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
         byte_encoder = tokenization_gpt2.bytes_to_unicode()
         byte_decoder = {v: k for k, v in byte_encoder.items()}
+        score = 0.0
         for i, item in enumerate(tokenizer):
             text: bytes = item.encode("utf-8")
-            score: float = -i
-            yield text, score, gguf.TokenType.USER_DEFINED
+            # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
+            if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
+                if i == 0 and text == b'<unk>':
+                    toktype = gguf.TokenType.UNKNOWN
+                elif i == 1 or i == 2:
+                    toktype = gguf.TokenType.CONTROL
+                elif i >= 3 and text.startswith(b'<0x'):
+                    toktype = gguf.TokenType.BYTE
+                else:
+                    toktype = gguf.TokenType.NORMAL
+            else:
+                toktype = gguf.TokenType.NORMAL
+            yield text, score, toktype
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
         for text in self.added_tokens_list: