Merge branch 'master' into batch_perplexity

2023-04-02 17:44:30 -07:00 · 2023-04-02 17:44:30 -07:00 · a17e745b6e
commit a17e745b6e
parent 43523220a4 a0c0516416
34 changed files with 2944 additions and 1723 deletions
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@ -6,7 +6,8 @@ RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip

 RUN pip install --upgrade pip setuptools wheel \
-    && pip install numpy requests sentencepiece torch tqdm
+    && pip install numpy requests sentencepiece tqdm \
+    && pip install torch --index-url https://download.pytorch.org/whl/cpu

 WORKDIR /app

--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -8,10 +8,10 @@ on:
        required: true
        type: boolean
  push:
-    paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']
  pull_request:
    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
-    paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.c', '**/*.cpp']

 env:
 BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@ -62,7 +62,43 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --output-on-failure
+          ctest --verbose
+
+  # Builds the project with CMake and runs ctest once per sanitizer configuration.
+  ubuntu-latest-cmake-sanitizer:
+    runs-on: ubuntu-latest
+
+    # A failure in this job does not fail the overall workflow run.
+    continue-on-error: true
+
+    strategy:
+      # 3 sanitizers x 2 build types x 2 accelerate settings = 12 matrix jobs.
+      matrix:
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [Debug, Release]
+        # NOTE(review): LLAMA_ACCELERATE presumably has no effect on an Ubuntu
+        # runner; if so this axis only doubles the job count. Confirm.
+        accelerate: [ON, OFF]
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      # Exactly one LLAMA_SANITIZE_<name> option is switched on per matrix entry.
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_ACCELERATE=${{ matrix.accelerate }}
+          cmake --build . --config ${{ matrix.build_type }}
+
+      - name: Test
+        id: cmake_test
+        run: |
+          cd build
+          ctest --verbose

  macOS-latest-make:
    runs-on: macos-latest
@ -107,11 +143,21 @@ jobs:
        id: cmake_test
        run: |
          cd build
-          ctest --output-on-failure
+          ctest --verbose

  windows-latest-cmake:
    runs-on: windows-latest

+    strategy:
+      matrix:
+        include:
+         - build: 'avx2'
+           defines: ''
+         - build: 'avx'
+           defines: '-DLLAMA_AVX2=OFF'
+         - build: 'avx512'
+           defines: '-DLLAMA_AVX512=ON'
+
    steps:
      - name: Clone
        id: checkout
@ -122,14 +168,28 @@ jobs:
        run: |
          mkdir build
          cd build
-          cmake ..
+          cmake .. ${{ matrix.defines }}
          cmake --build . --config Release

+      - name: Check AVX512F support
+        id: check_avx512f
+        if: ${{ matrix.build == 'avx512' }}
+        continue-on-error: true
+        run: |
+          cd build
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $cl =  $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
+          echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
+          & $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
+          .\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
+
      - name: Test
        id: cmake_test
+        if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
        run: |
          cd build
-          ctest -C Release --output-on-failure
+          ctest -C Release --verbose

      - name: Get commit hash
        id: commit
@ -140,12 +200,39 @@ jobs:
        id: pack_artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
        run: |
-          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\bin\Release\*
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
+
+  release:
+    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+
+    runs-on: ubuntu-latest
+
+    needs:
+      - ubuntu-latest-make
+      - ubuntu-latest-cmake
+      - macOS-latest-make
+      - macOS-latest-cmake
+      - windows-latest-cmake
+
+    steps:
+      - name: Download artifacts
+        id: download-artifact
+        uses: actions/download-artifact@v3
+
+      - name: Get commit hash
+        id: commit
+        uses: pr-mpt/actions-commit-hash@v2

      - name: Create release
        id: create_release
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: zendesk/action-create-release@v1
+        uses: anzz1/action-create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
@ -153,15 +240,25 @@ jobs:

      - name: Upload release
        id: upload_release
-        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-release-asset@v1
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        uses: actions/github-script@v3
        with:
-          upload_url: ${{ steps.create_release.outputs.upload_url }} 
-          asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
-          asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
-          asset_content_type: application/octet-stream
+          github-token: ${{secrets.GITHUB_TOKEN}}
+          script: |
+            // Upload every .zip file found in ./artifact as an asset of the
+            // release identified by the create_release step's `id` output.
+            const path = require('path');
+            const fs = require('fs');
+            const release_id = '${{ steps.create_release.outputs.id }}';
+            // readdirSync / readFileSync are synchronous and return plain
+            // values, so only the API call below is awaited.
+            for (const file of fs.readdirSync('./artifact')) {
+              if (path.extname(file) === '.zip') {
+                console.log('uploadReleaseAsset', file);
+                await github.repos.uploadReleaseAsset({
+                  owner: context.repo.owner,
+                  repo: context.repo.repo,
+                  release_id: release_id,
+                  name: file,
+                  data: fs.readFileSync(`./artifact/${file}`)
+                });
+              }
+            }

 #  ubuntu-latest-gcc:
 #    runs-on: ubuntu-latest
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -49,6 +49,7 @@ jobs:
        with:
          context: .
          push: true
+          platforms: linux/amd64,linux/arm64
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

@ -57,5 +58,6 @@ jobs:
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
+          platforms: linux/amd64,linux/arm64
          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,7 @@
 .vscode/
 .DS_Store

+.build/
 build/
 build-em/
 build-debug/
@ -20,9 +21,15 @@ models/*
 /quantize
 /result
 /perplexity
+/embedding
+/Pipfile

 arm_neon.h
 compile_commands.json

 .envrc
 .direnv/
+
+.venv
+__pycache__
+.swiftpm
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -54,6 +54,7 @@ option(LLAMA_SANITIZE_UNDEFINED     "llama: enable undefined sanitizer"
 # instruction set specific
 option(LLAMA_AVX                    "llama: enable AVX"                                     ON)
 option(LLAMA_AVX2                   "llama: enable AVX2"                                    ON)
+option(LLAMA_AVX512                 "llama: enable AVX512"                                  OFF)
 option(LLAMA_FMA                    "llama: enable FMA"                                     ON)

 # 3rd party libs
@ -67,7 +68,9 @@ option(LLAMA_BUILD_EXAMPLES         "llama: build examples" ${LLAMA_STANDALONE})
 # Compile flags
 #

+set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
+set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
@ -75,14 +78,17 @@ find_package(Threads REQUIRED)
 if (NOT MSVC)
    if (LLAMA_SANITIZE_THREAD)
        add_compile_options(-fsanitize=thread)
+        link_libraries(-fsanitize=thread)
    endif()

    if (LLAMA_SANITIZE_ADDRESS)
        add_compile_options(-fsanitize=address -fno-omit-frame-pointer)
+        link_libraries(-fsanitize=address)
    endif()

    if (LLAMA_SANITIZE_UNDEFINED)
        add_compile_options(-fsanitize=undefined)
+        link_libraries(-fsanitize=undefined)
    endif()
 endif()

@ -120,8 +126,9 @@ if (LLAMA_ALL_WARNINGS)
            -Wall
            -Wextra
            -Wpedantic
-            -Wshadow
            -Wcast-qual
+            -Wdouble-promotion
+            -Wshadow
            -Wstrict-prototypes
            -Wpointer-arith
            -Wno-unused-function
@ -131,6 +138,7 @@ if (LLAMA_ALL_WARNINGS)
            -Wextra
            -Wpedantic
            -Wcast-qual
+            -Wno-unused-function
        )
    else()
        # todo : msvc
@ -185,7 +193,9 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
 elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
    message(STATUS "x86 detected")
    if (MSVC)
-        if (LLAMA_AVX2)
+        if (LLAMA_AVX512)
+            add_compile_options(/arch:AVX512)
+        elseif (LLAMA_AVX2)
            add_compile_options(/arch:AVX2)
        elseif (LLAMA_AVX)
            add_compile_options(/arch:AVX)
@ -201,6 +211,12 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$")
        if (LLAMA_AVX2)
            add_compile_options(-mavx2)
        endif()
+        if (LLAMA_AVX512)
+            add_compile_options(-mavx512f)
+            # add_compile_options(-mavx512cd)
+            # add_compile_options(-mavx512dq)
+            # add_compile_options(-mavx512bw)
+        endif()
    endif()
 else()
    # TODO: support PowerPC
@ -239,7 +255,7 @@ endif()
 #

 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
-    enable_testing()
+    include(CTest)
    add_subdirectory(tests)
 endif ()

--- a/99
+++ b/99
@ -35,6 +35,10 @@ CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

+# warnings
+CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
+
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@ -66,92 +70,8 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
-	ifeq ($(UNAME_S),Darwin)
-		CFLAGS += -mf16c
-		AVX1_M := $(shell sysctl machdep.cpu.features)
-		ifneq (,$(findstring FMA,$(AVX1_M)))
-			CFLAGS += -mfma
-		endif
-		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-	else ifeq ($(UNAME_S),Linux)
-		AVX1_M := $(shell grep "avx " /proc/cpuinfo)
-		ifneq (,$(findstring avx,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
-		ifneq (,$(findstring avx2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell grep "fma " /proc/cpuinfo)
-		ifneq (,$(findstring fma,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell grep "f16c " /proc/cpuinfo)
-		ifneq (,$(findstring f16c,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
-		ifneq (,$(findstring sse3,$(SSE3_M)))
-			CFLAGS += -msse3
-		endif
-		AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
-		ifneq (,$(findstring avx512f,$(AVX512F_M)))
-			CFLAGS += -mavx512f
-		endif
-		AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
-		ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
-			CFLAGS += -mavx512bw
-		endif
-		AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
-		ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
-			CFLAGS += -mavx512dq
-		endif
-		AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
-		ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
-			CFLAGS += -mavx512vl
-		endif
-		AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
-		ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
-			CFLAGS += -mavx512cd
-		endif
-		AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
-		ifneq (,$(findstring avx512er,$(AVX512ER_M)))
-			CFLAGS += -mavx512er
-		endif
-		AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
-		ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
-			CFLAGS += -mavx512ifma
-		endif
-		AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
-		ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
-			CFLAGS += -mavx512pf
-		endif
-	else ifeq ($(UNAME_S),Haiku)
-		AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
-		ifneq (,$(findstring AVX,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
-		ifneq (,$(findstring FMA,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
-		ifneq (,$(findstring F16C,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-	else
-		CFLAGS += -mfma -mf16c -mavx -mavx2
-	endif
+	# Use all CPU extensions that are available:
+	CFLAGS += -march=native -mtune=native
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
@ -212,7 +132,7 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )

-default: main quantize perplexity
+default: main quantize perplexity embedding

 #
 # Build library
@ -228,7 +148,7 @@ common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o

 clean:
-	rm -vf *.o main quantize perplexity
+	rm -vf *.o main quantize perplexity embedding

 main: examples/main/main.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
@ -242,6 +162,9 @@ quantize: examples/quantize/quantize.cpp ggml.o llama.o
 perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
 	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)

+embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
+	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+
 #
 # Tests
 #
--- a/Package.swift
+++ b/Package.swift
@ -0,0 +1,20 @@
+// swift-tools-version:5.3
+
+import PackageDescription
+
+// Swift Package Manager manifest: one library product, "llama", backed by a
+// single target of the same name.
+let package = Package(
+    name: "llama",
+    products: [
+        .library(name: "llama", targets: ["llama"]),
+    ],
+    targets: [
+        .target(
+            name: "llama",
+            // The target is rooted at the package directory itself and
+            // compiles only the two listed source files.
+            path: ".",
+            sources: ["ggml.c", "llama.cpp"],
+            // Public headers are taken from the "spm-headers" directory.
+            publicHeadersPath: "spm-headers",
+            // Passes -Wno-shorten-64-to-32 to the C compiler for this target.
+            cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])]
+        ),
+    ],
+    cxxLanguageStandard: .cxx11
+)
--- a/README.md
+++ b/README.md
@ -1,5 +1,7 @@
 # llama.cpp

+![llama](https://user-images.githubusercontent.com/1991296/227761327-6d83e30e-2200-41a6-bfbb-f575231c54f4.png)
+
 [![Actions Status](https://github.com/ggerganov/llama.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/llama.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)

@ -8,9 +10,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 **Hot topics:**

 - [Roadmap (short-term)](https://github.com/ggerganov/llama.cpp/discussions/457)
- New C-style API is now available: https://github.com/ggerganov/llama.cpp/pull/370
- Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
- Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
+- Support for [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)

 ## Description

@ -35,6 +35,14 @@ Supported platforms:
 - [X] Windows (via CMake)
 - [X] Docker

+Supported models:
+
+- [X] LLaMA 🦙
+- [X] [Alpaca](https://github.com/ggerganov/llama.cpp#instruction-mode-with-alpaca)
+- [X] [GPT4All](https://github.com/ggerganov/llama.cpp#using-gpt4all)
+- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
+- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
+
 ---

 Here is a typical run using LLaMA-7B:
@ -147,8 +155,8 @@ python3 -m pip install torch numpy sentencepiece
 # convert the 7B model to ggml FP16 format
 python3 convert-pth-to-ggml.py models/7B/ 1

-# quantize the model to 4-bits
-python3 quantize.py 7B
+# quantize the model to 4-bits (using method 2 = q4_0)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2

 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
@ -220,6 +228,21 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 > 
 ```

+### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
+
+- Obtain the `gpt4all-lora-quantized.bin` model
+- It is distributed in the old `ggml` format, which is now obsolete
+- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
+convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
+
+  ```bash
+  python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model 
+  python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
+  ```
+  
+- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
+- The original model is saved in the same folder with a suffix `.orig`
+
 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data

 - **Under no circumstances share IPFS, magnet links, or any other links to model downloads anywhere in this repository, including in issues, discussions or pull requests. They will be immediately deleted.**
@ -246,7 +269,7 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
    
 ### Perplexity (Measuring model quality)

-You can pass `--perplexity` as a command line option to measure perplexity over the given prompt.  For more background,
+You can use the `perplexity` example to measure perplexity over the given prompt.  For more background,
 see https://huggingface.co/docs/transformers/perplexity.  However, in general, lower perplexity is better for LLMs.

 #### Latest measurements
@ -269,10 +292,10 @@ Perplexity - model options
 #### How to run

 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
 3. Output:
 ```
-Calculating perplexity over 655 chunks
+perplexity : calculating perplexity over 655 chunks
 24.43 seconds per pass - ETA 4.45 hours
 [1]4.5970,[2]5.1807,[3]6.0382,...
 ```
@ -280,7 +303,7 @@ And after 4.45 hours, you will have the final perplexity.

 ### Android

-You can easily run `llama.cpp` on Android device with [termux](https://play.google.com/store/apps/details?id=com.termux).
+You can easily run `llama.cpp` on an Android device with [termux](https://termux.dev/).
 First, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 ```
 $ mkdir build-android
@ -289,7 +312,7 @@ $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://play.google.com/store/apps/details?id=com.termux) on your device and run `termux-setup-storage` to get access to your SD card.
+Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
 Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on a Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
--- a/convert-ggml-to-pth.py
+++ b/convert-ggml-to-pth.py
@ -0,0 +1,299 @@
+# Author: github.com/ductai199x
+import argparse
+import os
+import struct
+
+import numpy as np
+import torch
+from numba import njit
+from tqdm.auto import tqdm
+
+
+def read_header(fin):
+    """Read the fixed-size file header from ``fin``.
+
+    The header is nine native int32 values. The first two are skipped, the
+    next five are the hyperparameters, the eighth (rot) is read but not
+    returned, and the ninth is the file type.
+
+    Returns:
+        A ``(hparams, ftype)`` tuple, where ``hparams`` is a dict with the
+        keys vocab_size, dim, multiple_of, n_heads and n_layers.
+    """
+    fields = struct.unpack("9i", fin.read(9 * 4))
+    hparam_names = ("vocab_size", "dim", "multiple_of", "n_heads", "n_layers")
+    hparams = dict(zip(hparam_names, fields[2:7]))
+    return hparams, fields[8]
+
+
+def read_tokens(fin, vocab_size):
+    """Read ``vocab_size`` vocabulary entries from ``fin``.
+
+    Each entry is an int32 byte length, that many bytes of token text, and a
+    float32 score. Bytes that are not valid UTF-8 are decoded with replacement
+    characters instead of raising.
+
+    Returns:
+        A list of ``(text, score)`` tuples in file order.
+    """
+
+    def _next_entry():
+        (length,) = struct.unpack("i", fin.read(4))
+        raw = fin.read(length)
+        # errors="replace" gives the same text as a strict decode for valid input.
+        text = raw.decode(errors="replace")
+        (score,) = struct.unpack("f", fin.read(4))
+        return text, score
+
+    return [_next_entry() for _ in range(vocab_size)]
+
+
+@njit
+def dequantize_weights_numba(fin_data, n_rows, n_cols):
+    # Expand 4-bit quantized data in ``fin_data`` into an (n_rows, n_cols)
+    # float32 array. The data is consumed row by row, block by block; each
+    # block is a 4-byte float32 scale followed by qk // 2 bytes, and every
+    # byte packs two 4-bit values.
+    qk = 32  # number of weights per block
+    nb = n_cols // qk  # number of blocks per row
+    bs = 4 + (qk // 2)  # bytes per block (scale + packed values); not used below
+
+    weights = np.zeros((n_rows, n_cols), dtype=np.float32)
+    data_pos = 0  # running byte offset into fin_data
+
+    for row in range(n_rows):
+        for block in range(nb):
+            # Per-block scale factor.
+            d = np.frombuffer(fin_data[data_pos : data_pos + 4], dtype=np.float32)[0]
+            data_pos += 4
+            packed_values = fin_data[data_pos : data_pos + (qk // 2)]
+            data_pos += qk // 2
+
+            for i in range(qk // 2):
+                packed_value = packed_values[i]
+                # Low nibble is the even-indexed weight, high nibble the odd one;
+                # both are shifted by -8 before being scaled.
+                v0 = np.float32((packed_value & 0b00001111) - 8) * d
+                v1 = np.float32((packed_value >> 4) - 8) * d
+
+                weights[row, block * qk + 2 * i] = v0
+                weights[row, block * qk + 2 * i + 1] = v1
+
+    return weights
+
+
+def dequantize_weights(fin, n_rows, n_cols):
+    """Read one 4-bit quantized tensor from ``fin`` and return it dequantized.
+
+    The number of bytes read is the packed payload (two weights per byte) plus
+    one 4-byte scale for every block of 32 weights in every row.
+    """
+    blocks_per_row = n_cols // 32
+    packed_bytes = (n_rows * n_cols) // 2
+    scale_bytes = n_rows * blocks_per_row * 4
+    raw = fin.read(packed_bytes + scale_bytes)
+    return dequantize_weights_numba(raw, n_rows, n_cols)
+
+
+def read_variables(fin):
+    """Read every tensor record from ``fin`` and return them as torch tensors.
+
+    Each record consists of three int32 values (n_dims, name_length, ftype),
+    ``n_dims`` int32 dimensions, the tensor name, padding up to the next
+    32-byte boundary, and then the tensor data. Reading stops as soon as a
+    complete 12-byte record header can no longer be read.
+
+    Args:
+        fin: Binary file object positioned at the first tensor record. It must
+            expose a ``name`` attribute, which is used to size the progress bar.
+
+    Returns:
+        A dict mapping tensor name to ``torch.Tensor``. float32 data stays
+        float32; float16 and dequantized 4-bit data are stored as float16.
+
+    Raises:
+        ValueError: If a record has an ftype other than 0, 1 or 2. Previously
+            such a record silently reused the data of the preceding tensor.
+    """
+    model = {}
+    # The context manager closes the progress bar even if parsing fails midway.
+    with tqdm(total=os.path.getsize(fin.name), unit="B", unit_scale=True, desc="Reading variables") as pbar:
+        while True:
+            start_pos = fin.tell()
+            try:
+                n_dims, name_length, ftype_cur = struct.unpack("iii", fin.read(4 * 3))
+            except struct.error:
+                # Short read: no further complete record header in the file.
+                break
+
+            # Dimensions are stored in the reverse of the order used for the array shape.
+            shape = tuple(struct.unpack("i" * n_dims, fin.read(4 * n_dims)))[::-1]
+            name = fin.read(name_length).decode()
+
+            # ensure tensor data is aligned
+            fin.seek((fin.tell() + 31) & -32)
+
+            if ftype_cur == 2:
+                # 4-bit quantized weights
+                dtype = np.uint8
+                data = dequantize_weights(fin, shape[0], shape[1]).reshape(shape)
+            elif ftype_cur in (0, 1):
+                dtype = np.float32 if ftype_cur == 0 else np.float16
+                # int(...) keeps the element count an integer even for an empty shape.
+                data = np.fromfile(fin, dtype=dtype, count=int(np.prod(shape))).reshape(shape)
+            else:
+                raise ValueError(f"Unsupported ftype {ftype_cur} for tensor '{name}'")
+
+            model[name] = torch.tensor(data, dtype=torch.float32 if dtype == np.float32 else torch.float16)
+
+            pbar.update(fin.tell() - start_pos)
+
+    return model
+
+
+def convert_to_hf_format(model, hparams):
+    """Build a new state dict from ``model`` using Hugging Face-style key names.
+
+    Args:
+        model: Dict of tensors keyed by the original names, e.g.
+            ``layers.{i}.attention.wq.weight`` and ``tok_embeddings.weight``.
+        hparams: Dict providing ``n_layers``, ``n_heads`` and ``dim``.
+
+    Returns:
+        A dict keyed by ``model.layers.{i}...``, ``model.embed_tokens.weight``,
+        ``model.norm.weight`` and ``lm_head.weight``. The wq/wk weights are
+        permuted; all other tensors are passed through unchanged. Each layer
+        also gets a ``rotary_emb.inv_freq`` entry computed here.
+    """
+    # This works for llama 7B, need to test with other models
+    n_layers = hparams["n_layers"]
+    n_heads = hparams["n_heads"]
+    dim = hparams["dim"]
+    dims_per_head = dim // n_heads
+    base = 10000.0
+    # One inverse frequency per pair of dimensions within a head.
+    inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
+
+    # permute for sliced rotary
+    def permute(w):
+        # View as (n_heads, dims_per_head // 2, 2, dim), swap axes 1 and 2,
+        # then flatten back to (dim, dim).
+        return w.view(n_heads, dim // n_heads // 2, 2, dim).transpose(1, 2).reshape(dim, dim)
+
+    state_dict = {}
+    for layer_i in range(n_layers):
+        state_dict.update(
+            {
+                f"model.layers.{layer_i}.self_attn.q_proj.weight": permute(
+                    model[f"layers.{layer_i}.attention.wq.weight"]
+                ),
+                f"model.layers.{layer_i}.self_attn.k_proj.weight": permute(
+                    model[f"layers.{layer_i}.attention.wk.weight"]
+                ),
+                f"model.layers.{layer_i}.self_attn.v_proj.weight": model[
+                    f"layers.{layer_i}.attention.wv.weight"
+                ],
+                f"model.layers.{layer_i}.self_attn.o_proj.weight": model[
+                    f"layers.{layer_i}.attention.wo.weight"
+                ],
+                f"model.layers.{layer_i}.mlp.gate_proj.weight": model[
+                    f"layers.{layer_i}.feed_forward.w1.weight"
+                ],
+                f"model.layers.{layer_i}.mlp.down_proj.weight": model[
+                    f"layers.{layer_i}.feed_forward.w2.weight"
+                ],
+                f"model.layers.{layer_i}.mlp.up_proj.weight": model[
+                    f"layers.{layer_i}.feed_forward.w3.weight"
+                ],
+                f"model.layers.{layer_i}.input_layernorm.weight": model[
+                    f"layers.{layer_i}.attention_norm.weight"
+                ],
+                f"model.layers.{layer_i}.post_attention_layernorm.weight": model[
+                    f"layers.{layer_i}.ffn_norm.weight"
+                ],
+            }
+        )
+        # The same inv_freq tensor object is shared by every layer's entry.
+        state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
+    state_dict.update(
+        {
+            "model.embed_tokens.weight": model["tok_embeddings.weight"],
+            "model.norm.weight": model["norm.weight"],
+            "lm_head.weight": model["output.weight"],
+        }
+    )
+
+    return state_dict
+
+
+def chat(model, hparams, llama_dir):
+    """Run an interactive console chat loop on the CPU with the given weights.
+
+    Args:
+        model: State dict loaded into ``LlamaForCausalLM`` with ``strict=True``.
+        hparams: Dict providing ``vocab_size``, ``dim``, ``n_layers`` and ``n_heads``.
+        llama_dir: Directory passed to ``LlamaTokenizer.from_pretrained``.
+
+    The loop has no exit condition of its own; it runs until ``input()`` or
+    generation raises.
+    """
+    # Imported lazily so transformers is only required when chat is used.
+    from transformers import (GenerationConfig, LlamaForCausalLM,
+                              LlamaTokenizer, StoppingCriteria,
+                              StoppingCriteriaList)
+    from transformers.models.llama.configuration_llama import LlamaConfig
+
+    class StoppingCriteriaSub(StoppingCriteria):
+        # Prints the decoded sequence on every call and stops on token id 13.
+        def __init__(self):
+            super().__init__()
+
+        # NOTE(review): ``stops`` is never read in this method.
+        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, stops=[]):
+            # ``tokenizer`` is the variable assigned later in the enclosing chat().
+            print(tokenizer.decode(input_ids[0]), end="", flush=True)
+            # Token id 13 is presumably the newline token; confirm against the tokenizer.
+            if input_ids[0][-1] == 13:
+                return True
+
+            return False
+
+    # NOTE(review): confirm that LlamaConfig accepts a ``dim`` keyword; it may
+    # expect the hidden size under a different name.
+    config = LlamaConfig(
+        vocab_size=hparams["vocab_size"],
+        dim=hparams["dim"],
+        num_hidden_layers=hparams["n_layers"],
+        num_attention_heads=hparams["n_heads"],
+    )
+
+    llama = LlamaForCausalLM(config=config)
+    llama.load_state_dict(state_dict=model, strict=True)
+    tokenizer = LlamaTokenizer.from_pretrained(llama_dir)
+
+    device = torch.device("cpu")
+    llama = llama.to(device)
+
+    # Initial context that seeds the conversation.
+    ctx = """You are AI.
+This is a dialog, where User interacts with AI. AI is helpful, kind, obedient, honest, respectful, direct, concise, should try to protect User's privacy, and knows its own limits. Also, AI must answer User and AI cannot stop the conversation by itself.
+User: Hello, AI.
+AI: Hello! How can I assist you today?
+"""
+    print(ctx.rstrip("\n"))
+    while True:
+        print("-" * 60)
+        prompt = input("User: ")
+        if ctx != "":
+            ctx = f"{ctx}User: {prompt}\n"
+        else:
+            ctx = f"{prompt}\nAI:"
+
+        # Once the context reaches 2048 characters, keep only its last 1920.
+        ctx = (ctx[-1920:]) if len(ctx) >= 2048 else ctx
+
+        print("-" * 60)
+        if len(ctx.strip()) > 0:
+            input_ids = tokenizer(ctx, return_tensors="pt")["input_ids"].to(device)
+            generation_config = GenerationConfig(
+                temperature=0.8,
+                top_p=0.95,
+                top_k=50,
+                repetition_penalty=1.1764,
+            )
+            with torch.no_grad():
+                generation_output = llama.generate(
+                    input_ids=input_ids,
+                    generation_config=generation_config,
+                    return_dict_in_generate=True,
+                    output_scores=True,
+                    max_length=2048,
+                    do_sample=True,
+                    stopping_criteria=StoppingCriteriaList([StoppingCriteriaSub()]),
+                )
+            # The decoded first sequence becomes the context for the next turn.
+            s = generation_output.sequences[0]
+            decoded = tokenizer.decode(s)
+            ctx = f"{decoded}\n"
+
+
+def main():
+    """Command-line entry point: convert ggml file(s) into a torch checkpoint.
+
+    All files in ``--input_dir`` whose names start with ``--prefix`` are read
+    in sorted order. The first file supplies the hyperparameters and the
+    vocabulary; tensors from every file are merged into one dict. The result
+    is saved as ``<input_dir>/<prefix>-to-torch.pth``. With ``--hf`` the
+    tensors are renamed first, and with ``--chat`` an interactive session is
+    started afterwards.
+
+    Exits through ``parser.error`` when no file matches the prefix, instead
+    of failing with an IndexError.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_dir", "-i", type=str, required=True, help="The input directory containing the ggml files."
+    )
+    parser.add_argument(
+        "--prefix",
+        "-p",
+        type=str,
+        required=True,
+        help="The prefix of the ggml files (ggml-model-f16 or ggml-model-q4_0).",
+    )
+    parser.add_argument(
+        "--hf",
+        action="store_true",
+        help="Whether to save the model in the Hugging Face format. (default: False)",
+    )
+    parser.add_argument(
+        "--chat", "-c", action="store_true", help="Whether to open a chat with the model. (default: False)"
+    )
+    args = parser.parse_args()
+
+    # The tokenizer is looked up in the parent of the input directory.
+    llama_dir = os.path.abspath(f"{args.input_dir}/../")
+
+    ggml_files = sorted(
+        [f"{args.input_dir}/{f}" for f in os.listdir(args.input_dir) if f.startswith(args.prefix)]
+    )
+    if not ggml_files:
+        parser.error(f"no files starting with '{args.prefix}' found in '{args.input_dir}'")
+
+    # ``with`` closes each input file as soon as its tensors have been read.
+    with open(ggml_files[0], "rb") as fin:
+        hparams, _ = read_header(fin)
+        tokens = read_tokens(fin, hparams["vocab_size"])
+        model = read_variables(fin)
+
+    for f in tqdm(ggml_files[1:]):
+        with open(f, "rb") as fin:
+            # Header and vocabulary are read only to advance to the tensor data.
+            read_header(fin)
+            read_tokens(fin, hparams["vocab_size"])
+            model.update(read_variables(fin))
+
+    if args.hf:
+        model = convert_to_hf_format(model, hparams)
+
+    pth_ckpt = {
+        "state_dict": model,
+        "hparams": hparams,
+        "tokens": tokens,
+    }
+
+    torch.save(pth_ckpt, f"{args.input_dir}/{args.prefix}-to-torch.pth")
+
+    if args.chat:
+        # chat() needs the renamed keys, so convert now if --hf did not already.
+        if not args.hf:
+            model = convert_to_hf_format(model, hparams)
+        chat(model, hparams, llama_dir)
+
+
+if __name__ == "__main__":
+    main()
--- a/convert-gpt4all-to-ggml.py
+++ b/convert-gpt4all-to-ggml.py
@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+
+#
+# TODO: deduplicate GPT4All with convert-unversioned-ggml-to-ggml.py
+#
+
+# Original by https://github.com/eiz
+# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
+import argparse
+import glob
+import os
+import struct
+import sys
+from sentencepiece import SentencePieceProcessor
+
+# Names of the integer hyperparameters stored in the ggml header, in the order
+# write_header() unpacks them; read_header() sizes its struct format from
+# len(HPARAMS).
+# NOTE(review): `keys` is a second module-level name bound to the same list and is
+# not referenced anywhere else in this script - presumably a leftover.
+HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+
+def parse_args():
+    """Parse the command line: the GPT4All model path and the tokenizer model path."""
+    cli = argparse.ArgumentParser(description='Upgrade a GPT4All model to the current format')
+    positionals = (
+        ('gpt4all_model', 'path to gpt4all-lora-quantized.bin'),
+        ('tokenizer_model', 'path to LLaMA tokenizer.model file'),
+    )
+    for arg_name, arg_help in positionals:
+        cli.add_argument(arg_name, help=arg_help)
+    return cli.parse_args()
+
+def read_header(f_in):
+    """Read the fixed-size header from `f_in` and return its fields as a tuple of ints.
+
+    The header consists of 3 + len(HPARAMS) values in struct format "i";
+    write_header() unpacks them as magic, the HPARAMS values, rot and ftype.
+    """
+    layout = struct.Struct(f"{3 + len(HPARAMS)}i")
+    return layout.unpack(f_in.read(layout.size))
+
+def write_header(f_out, header):
+    """Write the versioned header derived from an old-style header tuple.
+
+    `header` is the 8-tuple returned by read_header(). Raises Exception when its
+    magic is not 0x67676d6c ("ggml"), i.e. the input is not an unversioned file.
+    """
+    magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = header
+
+    if magic != 0x67676d6c:
+        raise Exception('Invalid file magic. Must be an old style ggml file.')
+
+    f_out.write(struct.pack(
+        "9i",
+        0x67676d66,  # magic: "ggmf" in hex
+        1,           # file version
+        vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype,
+    ))
+
+def write_tokens(fout, tokenizer):
+    """Serialize the tokenizer vocabulary, then append GPT4All's extra <pad> entry.
+
+    Each entry is written as: int byte length, the token bytes, float score.
+    Exits the process with status 1 when a byte token's piece is not exactly
+    6 characters long.
+    """
+    for token_id in range(tokenizer.vocab_size()):
+        if tokenizer.is_unknown(token_id):
+            token_bytes = " \u2047 ".encode()
+        elif tokenizer.is_control(token_id):
+            token_bytes = b""
+        elif not tokenizer.is_byte(token_id):
+            # regular piece: U+2581 is replaced by a plain space
+            token_bytes = tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode()
+        else:
+            piece = tokenizer.id_to_piece(token_id)
+            if len(piece) != 6:
+                print(f"Invalid token: {piece}")
+                sys.exit(1)
+            # characters 3..4 are parsed as hex (presumably a "<0xNN>" piece)
+            token_bytes = struct.pack("B", int(piece[3:-1], 16))
+        fout.write(struct.pack("i", len(token_bytes)))
+        fout.write(token_bytes)
+        fout.write(struct.pack("f", tokenizer.get_score(token_id)))
+
+    # TODO: GPT4All - add extra <pad> token
+    pad_bytes = "<pad>".encode()
+    fout.write(struct.pack("i", len(pad_bytes)))
+    fout.write(pad_bytes)
+    fout.write(struct.pack("f", 0.0))
+
+def read_tokens(f_in, tokenizer):
+    """Skip the vocabulary section of the old-format input file.
+
+    Reads tokenizer.vocab_size() entries, each an int byte length followed by
+    that many bytes, and discards them so `f_in` is left positioned just after
+    the last entry.
+
+    Raises:
+        EOFError: the file ends inside a length field or a token.
+        ValueError: a stored length is negative (corrupt or wrong-format file).
+    """
+    for token_id in range(tokenizer.vocab_size()):
+        len_b = f_in.read(4)
+        if len(len_b) != 4:
+            raise EOFError(f"unexpected end of file while reading the length of token {token_id}")
+        (length,) = struct.unpack("i", len_b)
+        # read() with a negative size would silently consume the rest of the file
+        if length < 0:
+            raise ValueError(f"invalid length {length} for token {token_id}")
+        if len(f_in.read(length)) != length:
+            raise EOFError(f"unexpected end of file while reading token {token_id}")
+
+def copy_all_data(f_out, f_in):
+    """Copy everything that remains in `f_in` to `f_out`, in chunks of up to 1 MiB."""
+    chunk_size = 1024 * 1024
+    chunk = f_in.read(chunk_size)
+    while chunk:
+        f_out.write(chunk)
+        chunk = f_in.read(chunk_size)
+
+def convert_one_file(path_in, tokenizer):
+    """Rewrite one model file in the new format, keeping the original as a backup.
+
+    The converted data is first written to "<path_in>.tmp". Only when that
+    succeeds is the original renamed to "<path_in>.orig" and the temporary file
+    moved into its place. If the conversion fails for any reason, the partial
+    temporary file is removed and the error is re-raised, leaving `path_in`
+    untouched.
+    """
+    path_tmp = f"{path_in}.tmp"
+    path_orig = f"{path_in}.orig"
+    print(f"converting {path_in}")
+    try:
+        with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
+            write_header(f_out, read_header(f_in))
+            read_tokens(f_in, tokenizer)
+            write_tokens(f_out, tokenizer)
+            copy_all_data(f_out, f_in)
+    except BaseException:
+        # BaseException so the sys.exit() inside write_tokens() is covered too
+        if os.path.exists(path_tmp):
+            os.remove(path_tmp)
+        raise
+    os.rename(path_in, path_orig)
+    os.rename(path_tmp, path_in)
+
+def main():
+    """Load the tokenizer named on the command line and convert the GPT4All model file."""
+    cli_args = parse_args()
+    sp_tokenizer = SentencePieceProcessor(cli_args.tokenizer_model)
+    convert_one_file(cli_args.gpt4all_model, sp_tokenizer)
+
+# Script entry point: run the conversion only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
--- a/convert-gptq-to-ggml.py
+++ b/convert-gptq-to-ggml.py
@ -50,7 +50,7 @@ fout.write(struct.pack("i", 4))
 # This loop unchanged from convert-pth-to-ggml.py:
 for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
-        text = " \u2047 ".encode("utf-8")
+        text = " \u2047 ".encode()
    elif tokenizer.is_control(i):
        text = b""
    elif tokenizer.is_byte(i):
@ -61,21 +61,26 @@ for i in range(tokenizer.vocab_size()):
        byte_value = int(piece[3:-1], 16)
        text = struct.pack("B", byte_value)
    else:
-        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+        text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", tokenizer.get_score(i)))

 def write_header(shape, dst_name, ftype_cur):
+    # Writes one tensor header to the module-level `fout`: three ints
+    # (number of dims, name length in bytes, ftype), the dims in reversed
+    # order, then the encoded name.
-    sname = dst_name.encode('utf-8')
+    sname = dst_name.encode()
    fout.write(struct.pack("iii", len(shape), len(sname), ftype_cur))
    fout.write(struct.pack("i" * len(shape), *shape[::-1]))
    fout.write(sname)

+    # ensure tensor data is aligned: round the current offset up to the next
+    # multiple of 32 ((x + 31) & -32) and move the write position there
+    tensor_data_offset = fout.tell()
+    tensor_data_offset = (tensor_data_offset + 31) & -32
+    fout.seek(tensor_data_offset)
+
 def convert_non_q4(src_name, dst_name):
    v = model[src_name]
    shape = v.shape
-    print("Processing non-Q4 variable: " + src_name + " with shape: ", shape, " and type: ", v.dtype)
+    print(f"Processing non-Q4 variable: {src_name} with shape: {shape} and type: {v.dtype}")
    if len(shape) == 1:
        print("  Converting to float32")
        v = v.to(torch.float32)
@ -100,7 +105,7 @@ def convert_q4(src_name, dst_name, permute=False):
    # Each int32 item is actually 8 int4 items packed together, and it's transposed.
    shape = (qweight.shape[0], qweight.shape[1] * 8)

-    print("Processing Q4 variable: " + src_name + " with shape: ", shape)
+    print(f"Processing Q4 variable: {src_name} with shape: {shape}")

    # The output format has the int4 weights in groups of 32 rather than 8.
    # It looks like this:
@ -163,5 +168,5 @@ for i in range(n_layer):

 fout.close()

-print("Done. Output file: " + fname_out)
-print("")
+print(f"Done. Output file: {fname_out}")
+print()
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@ -1,4 +1,4 @@
-# Convert a LLaMA model checkpoint to a ggml compatible file
+# Convert a LLaMA model checkpoint to a ggjt compatible file
 #
 # Load the model using Torch
 # Iterate over all variables and write them to a binary file.
@ -24,8 +24,57 @@ import torch

 from sentencepiece import SentencePieceProcessor

-def parse_args():
+QK = 32

+# Numeric ids of the ggml tensor types used as keys of the tables below.
+GGML_TYPE_Q4_0  = 0
+GGML_TYPE_Q4_1  = 1
+GGML_TYPE_I8    = 2
+GGML_TYPE_I16   = 3
+GGML_TYPE_I32   = 4
+GGML_TYPE_F16   = 5
+GGML_TYPE_F32   = 6
+
+# Maps an `ftype` value (as passed to ggml_nbytes() and written into tensor
+# headers) to the ggml tensor type id.
+WTYPES = {
+    0: GGML_TYPE_F32,
+    1: GGML_TYPE_F16,
+    2: GGML_TYPE_Q4_0,
+    3: GGML_TYPE_Q4_1,
+}
+
+# Number of elements per block for each type: QK for the Q4 types, 1 otherwise.
+GGML_BLCK_SIZE = {
+    GGML_TYPE_Q4_0:  QK,
+    GGML_TYPE_Q4_1:  QK,
+    GGML_TYPE_I8:    1,
+    GGML_TYPE_I16:   1,
+    GGML_TYPE_I32:   1,
+    GGML_TYPE_F16:   1,
+    GGML_TYPE_F32:   1,
+}
+
+# Size in bytes of one block of each type; ggml_nbytes() multiplies the element
+# count by this and divides by the block size.
+# NOTE(review): the Q4 entries presumably stand for 4-byte float(s) plus QK
+# 4-bit values packed two per byte - confirm against ggml.
+GGML_TYPE_SIZE = {
+    GGML_TYPE_Q4_0: 4   + QK//2,
+    GGML_TYPE_Q4_1: 4*2 + QK//2,
+    GGML_TYPE_I8:   1,
+    GGML_TYPE_I16:  2,
+    GGML_TYPE_I32:  4,
+    GGML_TYPE_F16:  2,
+    GGML_TYPE_F32:  4,
+}
+
+def ggml_nelements(shape):
+    """Return the number of elements of a tensor with the given shape (1 for an empty shape)."""
+    total = 1
+    for extent in shape:
+        total = total * extent
+    return total
+
+def ggml_nbytes(shape, ftype):
+    """Return the byte size of a tensor of `shape` stored with file type `ftype`.
+
+    `ftype` is a key of WTYPES; the element count is scaled by the per-block
+    byte size and integer-divided by the per-block element count.
+    """
+    n_elements = ggml_nelements(shape)
+    ggml_type = WTYPES[ftype]
+    return n_elements * GGML_TYPE_SIZE[ggml_type] // GGML_BLCK_SIZE[ggml_type]
+
+def parse_args():
    parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
    parser.add_argument('dir_model',  help='directory containing the model checkpoint')
    parser.add_argument('ftype',      help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
@ -33,7 +82,6 @@ def parse_args():
    return parser.parse_args()

 def get_n_parts(dim):
-
    mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
    n_parts = mappings.get(dim)
    if n_parts is None:
@ -44,30 +92,24 @@ def get_n_parts(dim):
    return n_parts

 def load_hparams_and_tokenizer(dir_model):
-
+    # Returns (hparams, tokenizer): the dict loaded from `<dir_model>/params.json`
+    # with "vocab_size" set from the tokenizer, and the SentencePieceProcessor
+    # loaded from `tokenizer.model` in the parent directory of `dir_model`.
    # `dir_model` is something like `models/7B` or `models/7B/`.
    # "tokenizer.model" is expected under model's parent dir.
    # When `dir_model` is a symlink, f"{dir_model}/../tokenizer.model" would not be found.
    # Let's use the model's parent dir directly.
    model_parent_dir = os.path.dirname(os.path.normpath(dir_model))
-
    fname_hparams = f"{dir_model}/params.json"
    fname_tokenizer = f"{model_parent_dir}/tokenizer.model"
-
    with open(fname_hparams, "r") as f:
        hparams = json.load(f)
        print(hparams)
-
    tokenizer = SentencePieceProcessor(fname_tokenizer)
    hparams.update({"vocab_size": tokenizer.vocab_size()})
-
    return hparams, tokenizer

 def write_header(fout, hparams, ftype):
-
    keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
    values = [
-        0x67676d66,  # magic: ggmf in hex
+        0x67676a74,  # magic: ggjt in hex
        1, # file version
        *[hparams[key] for key in keys],
        hparams["dim"] // hparams["n_heads"],  # rot (obsolete)
@ -76,10 +118,9 @@ def write_header(fout, hparams, ftype):
    fout.write(struct.pack("i" * len(values), *values))

 def write_tokens(fout, tokenizer):
-
    for i in range(tokenizer.vocab_size()):
        if tokenizer.is_unknown(i):
-            text = " \u2047 ".encode("utf-8")
+            text = " \u2047 ".encode()
        elif tokenizer.is_control(i):
            text = b""
        elif tokenizer.is_byte(i):
@ -90,92 +131,144 @@ def write_tokens(fout, tokenizer):
            byte_value = int(piece[3:-1], 16)
            text = struct.pack("B", byte_value)
        else:
-            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
+            text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode()
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        fout.write(struct.pack("f", tokenizer.get_score(i)))

-def process_and_write_variables(fout, model, ftype):
-
+def process_and_write_variables(fout, model, ftype, part_id, n_parts):
+    """Write every tensor of one checkpoint part into the single output file.
+
+    For each tensor a header (n_dims, name length, ftype, full dims, name) is
+    written, followed by zero padding up to a multiple of QK bytes. 1-D tensors
+    and single-part models are written whole, by part 0 only; 2-D tensors of
+    multi-part models are written at this part's row or column offset inside
+    the full tensor. main() seeks `fout` back to the start of the tensor
+    section before each part, so every part walks over the same headers and
+    fills in its own slice of the data.
+    """
    for name, datao in model.items():
-
        if name.endswith("freqs"):
            continue

-        shape = datao.shape
-
-        print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
-
+        # remove dimensions with a single element
        data = datao.numpy().squeeze()
-        n_dims = len(shape)
+        partshape = data.shape
+        n_dims = len(data.shape)
+        assert n_dims in (1, 2)

-        # default type is fp16
+        print(f"Processing variable: {name} with shape: {partshape} and type: {datao.dtype}")
+
+        # coerce single-dimensional tensors from float16 to float32
        ftype_cur = 1
        if ftype == 0 or n_dims == 1:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
+        blck_size = GGML_BLCK_SIZE[WTYPES[ftype_cur]]
+        type_size = GGML_TYPE_SIZE[WTYPES[ftype_cur]]

-        # header
-        sname = name.encode('utf-8')
-        fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
-        for dim in reversed(data.shape):
+        # determine dimension along which multipart tensor is sharded
+        #
+        # split_dim 0 regex:
+        #   - output.*
+        #   - layers.*.attention.wq.weight
+        #   - layers.*.attention.wk.weight
+        #   - layers.*.attention.wv.weight
+        #   - layers.*.feed_forward.w1.weight
+        #   - layers.*.feed_forward.w3.weight
+        #
+        # split_dim 1 regex:
+        #   - tok_embeddings.*
+        #   - layers.*.attention.wo.weight
+        #   - layers.*.feed_forward.w2.weight
+        #
+        # (split_dim is only assigned, and only read, for 2-D tensors)
+        if n_dims > 1:
+            split_dim = 1
+            if "tok_embeddings" in name:
+                split_dim = 1
+            elif "layers" in name:
+                if "attention.wo.weight" in name:
+                    split_dim = 1
+                elif "feed_forward.w2.weight" in name:
+                    split_dim = 1
+                else:
+                    split_dim = 0
+            elif "output" in name:
+                split_dim = 0
+
+        # output tensor header
+        # the header carries the shape of the reassembled tensor: the sharded
+        # dimension of this part multiplied by the number of parts
+        fullshape = list(partshape)
+        if n_dims > 1:
+            fullshape[split_dim] *= n_parts
+        sname = name.encode()
+        fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
+        for dim in reversed(fullshape):
            fout.write(struct.pack("i", dim))
        fout.write(sname)

-        # data output to file
-        data.tofile(fout)
+        # ensure tensor data is aligned
+        # (zero bytes are written until the offset is a multiple of QK)
+        tensor_data_offset = fout.tell()
+        while tensor_data_offset % QK != 0:
+            fout.write(struct.pack("B", 0))
+            tensor_data_offset += 1
+
+        # output unified mappable tensor data
+        if n_dims == 1 or n_parts == 1:
+            # copy tensor which we thankfully received in one piece
+            # (written by part 0 only; later parts just skip over it)
+            if part_id == 0:
+                data.tofile(fout)
+        elif split_dim == 0:
+            # reassemble multifile tensor containing some of the rows
+            rows_per_chunk = partshape[0]
+            current_row = part_id * rows_per_chunk
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset = current_row * bytes_per_row
+            fout.seek(tensor_data_offset + offset)
+            data.tofile(fout)
+        elif split_dim == 1:
+            # reassemble multifile tensor containing some of the cols
+            # (each row of this part is written at its own offset inside the full row)
+            cols_per_chunk = partshape[1]
+            current_col = part_id * cols_per_chunk
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset_current_col = current_col // blck_size * type_size
+            for row in range(partshape[0]):
+                offset_row = row * bytes_per_row
+                offset = offset_row + offset_current_col
+                fout.seek(tensor_data_offset + offset)
+                data[row].tofile(fout)
+
+        # advance file position to next tensor
+        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype_cur))

 def main():
-
+    # Converts the checkpoint in `args.dir_model` into one output file
+    # (`ggml-model-<f32|f16>.bin`), or writes only header and vocabulary to
+    # `ggml-vocab.bin` when `args.vocab_only` is set.
    args = parse_args()
    dir_model = args.dir_model
    ftype = args.ftype
    ftype_str = ["f32", "f16"]
-
    hparams, tokenizer = load_hparams_and_tokenizer(dir_model)

    print(args)

    # if only writing vocab to file
    if args.vocab_only:
-
        fname_model = f"{dir_model}/consolidated.00.pth"
        fname_out = f"{dir_model}/ggml-vocab.bin"
-
+        # NOTE(review): the checkpoint is no longer loaded in this branch;
+        # fname_model is only used in the message below.
        print(f"Extracting only the vocab from '{fname_model}'\n")
-
-        model = torch.load(fname_model, map_location="cpu")
-
        with open(fname_out, "wb") as fout:
            write_header(fout, hparams, ftype)
            write_tokens(fout, tokenizer)
-
-        del model
-
        print(f"Done. Output file: {fname_out}\n")
-
        return

    n_parts = get_n_parts(hparams["dim"])
+    fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin"

-    for p in range(n_parts):
+    # we output a single file for ggml
+    with open(fname_out, "wb") as fout:
+        write_header(fout, hparams, ftype)
+        write_tokens(fout, tokenizer)
+        offset_of_tensors = fout.tell()
+        # the tensors we load could be split across multiple files
+        for part_id in range(n_parts):
+            # every part starts again at the first tensor header and writes
+            # its slice of each tensor into the same output file
+            fout.seek(offset_of_tensors)
+            print(f"Processing part {part_id+1} of {n_parts}\n")
+            fname_model = f"{dir_model}/consolidated.0{part_id}.pth"
+            model = torch.load(fname_model, map_location="cpu")
+            process_and_write_variables(fout, model, ftype, part_id, n_parts)
+            del model

-        print(f"Processing part {p}\n")
-
-        fname_model = f"{dir_model}/consolidated.0{p}.pth"
-        fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
-
-        model = torch.load(fname_model, map_location="cpu")
-
-        with open(fname_out, "wb") as fout:
-            write_header(fout, hparams, ftype)
-            write_tokens(fout, tokenizer)
-            process_and_write_variables(fout, model, ftype)
-
-        del model
-
-        print(f"Done. Output file: {fname_out}, (part {p})\n")
+    print(f"Done. Output file: {fname_out}\n")

 if __name__ == "__main__":
    main()
--- a/convert-unversioned-ggml-to-ggml.py
+++ b/convert-unversioned-ggml-to-ggml.py
@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# Original by https://github.com/eiz
+# https://github.com/ggerganov/llama.cpp/issues/324#issuecomment-1476227818
+import argparse
+import glob
+import os
+import struct
+import sys
+from sentencepiece import SentencePieceProcessor
+
+# Names of the integer hyperparameters stored in the ggml header, in the order
+# write_header() unpacks them; read_header() sizes its struct format from
+# len(HPARAMS).
+# NOTE(review): `keys` is a second module-level name bound to the same list and is
+# not referenced anywhere else in this script - presumably a leftover.
+HPARAMS = keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
+
+def parse_args():
+    """Parse the command line: the model directory and the tokenizer model path."""
+    cli = argparse.ArgumentParser(description='Upgrade old ggml model files to the current format')
+    positionals = (
+        ('dir_model', 'directory containing ggml .bin files'),
+        ('tokenizer_model', 'path to LLaMA tokenizer.model file'),
+    )
+    for arg_name, arg_help in positionals:
+        cli.add_argument(arg_name, help=arg_help)
+    return cli.parse_args()
+
+def read_header(f_in):
+    """Read the fixed-size header from `f_in` and return its fields as a tuple of ints.
+
+    The header consists of 3 + len(HPARAMS) values in struct format "i";
+    write_header() unpacks them as magic, the HPARAMS values, rot and ftype.
+    """
+    layout = struct.Struct(f"{3 + len(HPARAMS)}i")
+    return layout.unpack(f_in.read(layout.size))
+
+def write_header(f_out, header):
+    """Write the versioned header derived from an old-style header tuple.
+
+    `header` is the 8-tuple returned by read_header(). Raises Exception when its
+    magic is not 0x67676d6c ("ggml"), i.e. the input is not an unversioned file.
+    """
+    magic, vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype = header
+
+    if magic != 0x67676d6c:
+        raise Exception('Invalid file magic. Must be an old style ggml file.')
+
+    f_out.write(struct.pack(
+        "9i",
+        0x67676d66,  # magic: "ggmf" in hex
+        1,           # file version
+        vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype,
+    ))
+
+def write_tokens(fout, tokenizer):
+    """Serialize the tokenizer vocabulary to `fout`.
+
+    Each entry is written as: int byte length, the token bytes, float score.
+    Exits the process with status 1 when a byte token's piece is not exactly
+    6 characters long.
+    """
+    for token_id in range(tokenizer.vocab_size()):
+        if tokenizer.is_unknown(token_id):
+            token_bytes = " \u2047 ".encode()
+        elif tokenizer.is_control(token_id):
+            token_bytes = b""
+        elif not tokenizer.is_byte(token_id):
+            # regular piece: U+2581 is replaced by a plain space
+            token_bytes = tokenizer.id_to_piece(token_id).replace("\u2581", " ").encode()
+        else:
+            piece = tokenizer.id_to_piece(token_id)
+            if len(piece) != 6:
+                print(f"Invalid token: {piece}")
+                sys.exit(1)
+            # characters 3..4 are parsed as hex (presumably a "<0xNN>" piece)
+            token_bytes = struct.pack("B", int(piece[3:-1], 16))
+        fout.write(struct.pack("i", len(token_bytes)))
+        fout.write(token_bytes)
+        fout.write(struct.pack("f", tokenizer.get_score(token_id)))
+
+def read_tokens(f_in, tokenizer):
+    """Skip the vocabulary section of the old-format input file.
+
+    Reads tokenizer.vocab_size() entries, each an int byte length followed by
+    that many bytes, and discards them so `f_in` is left positioned just after
+    the last entry.
+
+    Raises:
+        EOFError: the file ends inside a length field or a token.
+        ValueError: a stored length is negative (corrupt or wrong-format file).
+    """
+    for token_id in range(tokenizer.vocab_size()):
+        len_b = f_in.read(4)
+        if len(len_b) != 4:
+            raise EOFError(f"unexpected end of file while reading the length of token {token_id}")
+        (length,) = struct.unpack("i", len_b)
+        # read() with a negative size would silently consume the rest of the file
+        if length < 0:
+            raise ValueError(f"invalid length {length} for token {token_id}")
+        if len(f_in.read(length)) != length:
+            raise EOFError(f"unexpected end of file while reading token {token_id}")
+
+def copy_all_data(f_out, f_in):
+    """Copy everything that remains in `f_in` to `f_out`, in chunks of up to 1 MiB."""
+    chunk_size = 1024 * 1024
+    chunk = f_in.read(chunk_size)
+    while chunk:
+        f_out.write(chunk)
+        chunk = f_in.read(chunk_size)
+
+def convert_one_file(path_in, tokenizer):
+    """Rewrite one model file in the new format, keeping the original as a backup.
+
+    The converted data is first written to "<path_in>.tmp". Only when that
+    succeeds is the original renamed to "<path_in>.orig" and the temporary file
+    moved into its place. If the conversion fails for any reason, the partial
+    temporary file is removed and the error is re-raised, leaving `path_in`
+    untouched.
+    """
+    path_tmp = f"{path_in}.tmp"
+    path_orig = f"{path_in}.orig"
+    print(f"converting {path_in}")
+    try:
+        with open(path_in, "rb") as f_in, open(path_tmp, "wb") as f_out:
+            write_header(f_out, read_header(f_in))
+            read_tokens(f_in, tokenizer)
+            write_tokens(f_out, tokenizer)
+            copy_all_data(f_out, f_in)
+    except BaseException:
+        # BaseException so the sys.exit() inside write_tokens() is covered too
+        if os.path.exists(path_tmp):
+            os.remove(path_tmp)
+        raise
+    os.rename(path_in, path_orig)
+    os.rename(path_tmp, path_in)
+
+def main():
+    """Convert every ggml model file found in the directory given on the command line.
+
+    Model files are those matching "*.bin" or "*.bin.*" in `dir_model`, except
+    names ending in ".orig" or ".tmp", which are the backup and scratch files
+    convert_one_file() itself creates.
+    """
+    args = parse_args()
+    candidates = glob.glob(f"{args.dir_model}/*.bin") + glob.glob(f"{args.dir_model}/*.bin.*")
+    # "*.bin.*" also matches "<name>.bin.orig" / "<name>.bin.tmp" left by an
+    # earlier run; those must never be treated as models. Sorting makes the
+    # processing order deterministic.
+    files = sorted(f for f in candidates if not f.endswith((".orig", ".tmp")))
+
+    tokenizer = SentencePieceProcessor(args.tokenizer_model)
+
+    for file in files:
+        convert_one_file(file, tokenizer)
+
+# Script entry point: run the conversion only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
--- a/examples/chat-13B.bat
+++ b/examples/chat-13B.bat
@ -0,0 +1,57 @@
+@setlocal disabledelayedexpansion enableextensions
+@echo off
+
+cd /d "%~dp0.."
+if not "%errorlevel%"=="0" (
+    echo Unable to change directory.
+    pause
+    exit /b 1
+)
+
+if not defined MODEL set "MODEL=models\13B\ggml-model-q4_0.bin"
+if not defined USER_NAME set "USER_NAME=User"
+if not defined AI_NAME set "AI_NAME=ChatLLaMa"
+rem Adjust to the number of CPU cores you want to use.
+rem if not defined N_THREAD set "N_THREAD=8"
+rem Number of tokens to predict (made it larger than default because we want a long interaction)
+if not defined N_PREDICTS set "N_PREDICTS=2048"
+if not defined GEN_OPTIONS set "GEN_OPTIONS=--ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647"
+
+rem Default main script paths
+rem (space-separated candidates, relative to the directory selected by the cd at the top of this script)
+set "DEFAULT_MAIN_SCRIPT_PATHS=main.exe build\bin\main.exe"
+
+rem Get main script path from command line arguments
+set "MAIN_SCRIPT_PATH=%~1"
+
+rem If the main script path was not specified, try the default paths
+rem The loop does not stop at the first hit, so when several candidates exist the last one in the list is used.
+if not defined MAIN_SCRIPT_PATH (
+    for %%i in (%DEFAULT_MAIN_SCRIPT_PATHS%) do (
+        if exist "%%i" set "MAIN_SCRIPT_PATH=%%i"
+    )
+)
+
+rem If the main script path was not found, tell the user how to specify it
+if not defined MAIN_SCRIPT_PATH (
+    echo The main script could not be found. Please provide the path to the main script as 1st argument to this script, or place the main script in one of the default locations:
+    echo %DEFAULT_MAIN_SCRIPT_PATHS%
+    pause
+    exit /b 1
+)
+
+rem Default context, feel free to edit it
+set "PROMPT_TEXT=Text transcript of a never ending dialog, where %USER_NAME% interacts with an AI assistant named %AI_NAME%. %AI_NAME% is helpful, kind, honest, friendly, good at writing and never fails to answer %USER_NAME%'s requests immediately and with details and precision. There are no annotations like (30 seconds passed...) or (to himself), just what %USER_NAME% and %AI_NAME% say aloud to each other. The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. The transcript only includes text, it does not include markup like HTML and Markdown."
+
+rem Set a temporary variable if N_THREAD is set
+if defined N_THREAD (
+    set "_N_THREAD=--threads %N_THREAD%"
+) else (
+    set "_N_THREAD="
+)
+
+rem Run the script
+rem (the command is invoked directly; prefixing it with echo would only print it)
+"%MAIN_SCRIPT_PATH%" %GEN_OPTIONS% %_N_THREAD% ^
+  --model "%MODEL%" ^
+  --n_predict %N_PREDICTS% ^
+  --color --interactive ^
+  --reverse-prompt "%USER_NAME%:" ^
+  --prompt "%PROMPT_TEXT%"
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -9,11 +9,20 @@
 #include <iterator>
 #include <algorithm>

- #if defined(_MSC_VER) || defined(__MINGW32__)
- #include <malloc.h> // using malloc.h with MSC/MINGW
- #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
- #include <alloca.h>
- #endif
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h> // using malloc.h with MSC/MINGW
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__)
+#include <alloca.h>
+#endif
+
+#if defined (_WIN32)
+#pragma comment(lib,"kernel32.lib")
+extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
+extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
+extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
+extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
+extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
+#endif

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    // determine sensible default number of threads.
@ -30,6 +39,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {

    bool invalid_param = false;
    std::string arg;
+    gpt_params default_params;
+
    for (int i = 1; i < argc; i++) {
        arg = argv[i];

@ -57,6 +68,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                invalid_param = true;
+                break;
+            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
@ -159,7 +175,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            }
            params.n_parts = std::stoi(argv[i]);
        } else if (arg == "-h" || arg == "--help") {
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(0);
        } else if (arg == "--random-prompt") {
            params.random_prompt = true;
@ -171,13 +187,13 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.input_prefix = argv[i];
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            gpt_print_usage(argc, argv, params);
+            gpt_print_usage(argc, argv, default_params);
            exit(1);
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        gpt_print_usage(argc, argv, params);
+        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }

@ -204,19 +220,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        prompt file to start generation.\n");
-    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 - infinity)\n", params.n_predict);
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
-    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", (double)params.top_p);
    fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
-    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
+    fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty);
    fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore end of stream token and continue generating\n");
    fprintf(stderr, "  --memory_f32          use f32 instead of f16 for memory key+value\n");
-    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
+    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", (double)params.temp);
    fprintf(stderr, "  --n_parts N           number of model parts (default: -1 = determine from dimensions)\n");
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
-    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt\n");
+    fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
    if (ggml_mlock_supported()) {
        fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
    }
@ -256,3 +272,47 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s

    return res;
 }
+
+/* Remember the active output color in con_st and emit the matching ANSI
+ * sequence only when coloring is enabled and the requested color differs. */
+void set_console_color(console_state & con_st, console_color_t color) {
+    // nothing to do when colors are disabled or the color is already active
+    if (!con_st.use_color || con_st.color == color) {
+        return;
+    }
+
+    switch (color) {
+        case CONSOLE_COLOR_DEFAULT:
+            printf(ANSI_COLOR_RESET);
+            break;
+        case CONSOLE_COLOR_PROMPT:
+            printf(ANSI_COLOR_YELLOW);
+            break;
+        case CONSOLE_COLOR_USER_INPUT:
+            printf(ANSI_BOLD ANSI_COLOR_GREEN);
+            break;
+    }
+
+    con_st.color = color;
+}
+
+#if defined (_WIN32)
+// Windows-only console setup: optionally enables ANSI escape processing on the
+// output console and switches the console output/input code pages to UTF-8.
+// enable_color: when true, ENABLE_VIRTUAL_TERMINAL_PROCESSING is turned on if
+// it is not already set.
+void win32_console_init(bool enable_color) {
+    unsigned long dwMode = 0;
+    void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
+    // Fall back to stderr when stdout is missing, invalid ((void*)-1), or
+    // GetConsoleMode fails on it; give up (hConOut = 0) if stderr fails as well.
+    if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
+        hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
+        if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
+            hConOut = 0;
+        }
+    }
+    if (hConOut) {
+        // Enable ANSI colors on Windows 10+
+        if (enable_color && !(dwMode & 0x4)) {
+            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
+        }
+        // Set console output codepage to UTF8
+        SetConsoleOutputCP(65001); // CP_UTF8
+    }
+    // The input code page is only changed when GetConsoleMode succeeds on stdin.
+    void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
+    if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
+        // Set console input codepage to UTF8
+        SetConsoleCP(65001); // CP_UTF8
+    }
+}
+#endif
--- a/examples/common.h
+++ b/examples/common.h
@ -63,3 +63,33 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 //

 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos);
+
+//
+// Console utils
+//
+
+#define ANSI_COLOR_RED     "\x1b[31m"
+#define ANSI_COLOR_GREEN   "\x1b[32m"
+#define ANSI_COLOR_YELLOW  "\x1b[33m"
+#define ANSI_COLOR_BLUE    "\x1b[34m"
+#define ANSI_COLOR_MAGENTA "\x1b[35m"
+#define ANSI_COLOR_CYAN    "\x1b[36m"
+#define ANSI_COLOR_RESET   "\x1b[0m"
+#define ANSI_BOLD          "\x1b[1m"
+
+// Logical output colors accepted by set_console_color().
+enum console_color_t {
+    CONSOLE_COLOR_DEFAULT=0,
+    CONSOLE_COLOR_PROMPT,
+    CONSOLE_COLOR_USER_INPUT
+};
+
+// Console coloring state passed to set_console_color().
+struct console_state {
+    bool use_color = false;                        // colors are emitted only when true
+    console_color_t color = CONSOLE_COLOR_DEFAULT; // color most recently selected
+};
+
+void set_console_color(console_state & con_st, console_color_t color);
+
+#if defined (_WIN32)
+void win32_console_init(bool enable_color);
+#endif
--- a/examples/gpt4all.sh
+++ b/examples/gpt4all.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+
+#
+# Temporary script - will be removed in the future
+#
+
+# Run from the parent of this script's directory. The path is quoted so that
+# directories containing spaces work, and the script aborts if a cd fails
+# instead of launching ./main from the wrong place.
+cd "$(dirname "$0")" || exit 1
+cd .. || exit 1
+
+./main --color --instruct --threads 4 \
+       --model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
+       --file ./prompts/alpaca.txt \
+       --batch_size 8 --ctx_size 2048 \
+       --repeat_last_n 64 --repeat_penalty 1.3 \
+       --n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -18,67 +18,13 @@
 #include <signal.h>
 #endif

-#if defined (_WIN32)
-#pragma comment(lib,"kernel32.lib")
-extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
-extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
-extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
-#endif
-
-#define ANSI_COLOR_RED     "\x1b[31m"
-#define ANSI_COLOR_GREEN   "\x1b[32m"
-#define ANSI_COLOR_YELLOW  "\x1b[33m"
-#define ANSI_COLOR_BLUE    "\x1b[34m"
-#define ANSI_COLOR_MAGENTA "\x1b[35m"
-#define ANSI_COLOR_CYAN    "\x1b[36m"
-#define ANSI_COLOR_RESET   "\x1b[0m"
-#define ANSI_BOLD          "\x1b[1m"
-
-/* Keep track of current color of output, and emit ANSI code if it changes. */
-enum console_state {
-    CONSOLE_STATE_DEFAULT=0,
-    CONSOLE_STATE_PROMPT,
-    CONSOLE_STATE_USER_INPUT
-};
-
-static console_state con_st = CONSOLE_STATE_DEFAULT;
-static bool con_use_color = false;
-
-void enable_console_colors() {
-#if defined (_WIN32)
-    // Enable ANSI colors on Windows 10+
-    unsigned long dwMode = 0;
-    void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
-    if (hConOut && hConOut != (void*)-1 && GetConsoleMode(hConOut, &dwMode) && !(dwMode & 0x4)) {
-        SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
-    }
-#endif
-}
-
-void set_console_state(console_state new_st) {
-    if (!con_use_color) return;
-    // only emit color code if state changed
-    if (new_st != con_st) {
-        con_st = new_st;
-        switch(con_st) {
-        case CONSOLE_STATE_DEFAULT:
-            printf(ANSI_COLOR_RESET);
-            return;
-        case CONSOLE_STATE_PROMPT:
-            printf(ANSI_COLOR_YELLOW);
-            return;
-        case CONSOLE_STATE_USER_INPUT:
-            printf(ANSI_BOLD ANSI_COLOR_GREEN);
-            return;
-        }
-    }
-}
+static console_state con_st;

 static bool is_interacting = false;

 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
-    set_console_state(CONSOLE_STATE_DEFAULT);
+    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
    printf("\n"); // this also force flush stdout.
    if (signo == SIGINT) {
        if (!is_interacting) {
@ -98,6 +44,14 @@ int main(int argc, char ** argv) {
        return 1;
    }

+    // save choice to use color for later
+    // (note for later: this is a slightly awkward choice)
+    con_st.use_color = params.use_color;
+
+#if defined (_WIN32)
+    win32_console_init(params.use_color);
+#endif
+
    if (params.perplexity) {
        printf("\n************\n");
        printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__);
@ -130,10 +84,6 @@ int main(int argc, char ** argv) {
        params.prompt = gpt_random_prompt(rng);
    }

-    // save choice to use color for later
-    // (note for later: this is a slightly awkward choice)
-    con_use_color = params.use_color;
-
 //    params.prompt = R"(// this function checks if the number n is prime
 //bool is_prime(int n) {)";

@ -196,7 +146,10 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    params.n_keep    = std::min(params.n_keep,    (int) embd_inp.size());
+    // number of tokens to keep when resetting context
+    if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) {
+        params.n_keep = (int)embd_inp.size();
+    }

    // prefix & suffix for instruct mode
    const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true);
@ -204,16 +157,12 @@ int main(int argc, char ** argv) {

    // in instruct mode, we inject a prefix and a suffix to each input by the user
    if (params.instruct) {
-        params.interactive = true;
+        params.interactive_start = true;
        params.antiprompt.push_back("### Instruction:\n\n");
    }

-    // enable interactive mode if reverse prompt is specified
-    if (params.antiprompt.size() != 0) {
-        params.interactive = true;
-    }
-
-    if (params.interactive_start) {
+    // enable interactive mode if reverse prompt or interactive start is specified
+    if (params.antiprompt.size() != 0 || params.interactive_start) { 
        params.interactive = true;
    }

@ -260,7 +209,8 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
        }
    }
-    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+    fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n",
+        params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
    fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
    fprintf(stderr, "\n\n");

@ -275,20 +225,18 @@ int main(int argc, char ** argv) {
 #endif
               " - Press Return to return control to LLaMa.\n"
               " - If you want to submit another line, end your input in '\\'.\n\n");
-        is_interacting = params.interactive_start || params.instruct;
+        is_interacting = params.interactive_start;
    }

-    bool input_noecho = false;
+    bool is_antiprompt = false;
+    bool input_noecho  = false;

    int n_past     = 0;
    int n_remain   = params.n_predict;
    int n_consumed = 0;

    // the first thing we will do is to output the prompt, so set color accordingly
-    if (params.use_color) {
-        enable_console_colors();
-    }
-    set_console_state(CONSOLE_STATE_PROMPT);
+    set_console_color(con_st, CONSOLE_COLOR_PROMPT);

    std::vector<llama_token> embd;

@ -327,10 +275,10 @@ int main(int argc, char ** argv) {

        if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
            // out of user input, sample next token
-            const float top_k          = params.top_k;
-            const float top_p          = params.top_p;
-            const float temp           = params.temp;
-            const float repeat_penalty = params.repeat_penalty;
+            const int32_t top_k          = params.top_k;
+            const float   top_p          = params.top_p;
+            const float   temp           = params.temp;
+            const float   repeat_penalty = params.repeat_penalty;

            llama_token id = 0;

@ -389,36 +337,38 @@ int main(int argc, char ** argv) {
        }
        // reset color to default if we there is no pending user input
        if (!input_noecho && (int)embd_inp.size() == n_consumed) {
-            set_console_state(CONSOLE_STATE_DEFAULT);
+            set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
        }

        // in interactive mode, and not currently processing queued inputs;
        // check if we should prompt the user for more
        if (params.interactive && (int) embd_inp.size() <= n_consumed) {
-            // check for reverse prompt
-            std::string last_output;
-            for (auto id : last_n_tokens) {
-                last_output += llama_token_to_str(ctx, id);
-            }

-            // Check if each of the reverse prompts appears at the end of the output.
-            for (std::string & antiprompt : params.antiprompt) {
-                if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
-                    is_interacting = true;
-                    set_console_state(CONSOLE_STATE_USER_INPUT);
-                    fflush(stdout);
-                    break;
+            // check for reverse prompt
+            if (params.antiprompt.size()) {
+                std::string last_output;
+                for (auto id : last_n_tokens) {
+                    last_output += llama_token_to_str(ctx, id);
+                }
+
+                is_antiprompt = false;
+                // Check if each of the reverse prompts appears at the end of the output.
+                for (std::string & antiprompt : params.antiprompt) {
+                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
+                        is_interacting = true;
+                        is_antiprompt = true;
+                        set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
+                        fflush(stdout);
+                        break;
+                    }
                }
            }

            if (n_past > 0 && is_interacting) {
                // potentially set color to indicate we are taking user input
-                set_console_state(CONSOLE_STATE_USER_INPUT);
+                set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);

                if (params.instruct) {
-                    n_consumed = embd_inp.size();
-                    embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
-
                    printf("\n> ");
                }

@ -431,7 +381,10 @@ int main(int argc, char ** argv) {
                std::string line;
                bool another_line = true;
                do {
-                    std::getline(std::cin, line);
+                    if (!std::getline(std::cin, line)) {
+                        // input stream is bad or EOF received
+                        return 0;
+                    }
                    if (line.empty() || line.back() != '\\') {
                        another_line = false;
                    } else {
@ -441,17 +394,29 @@ int main(int argc, char ** argv) {
                } while (another_line);

                // done taking input, reset color
-                set_console_state(CONSOLE_STATE_DEFAULT);
+                set_console_color(con_st, CONSOLE_COLOR_DEFAULT);

-                auto line_inp = ::llama_tokenize(ctx, buffer, false);
-                embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+                // Add tokens to embd only if the input buffer is non-empty
+                // Entering an empty line lets the user pass control back
+                if (buffer.length() > 1) {

-                if (params.instruct) {
-                    embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                    // instruct mode: insert instruction prefix
+                    if (params.instruct && !is_antiprompt) {
+                        n_consumed = embd_inp.size();
+                        embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
+                    }
+
+                    auto line_inp = ::llama_tokenize(ctx, buffer, false);
+                    embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
+
+                    // instruct mode: insert response suffix
+                    if (params.instruct) {
+                        embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
+                    }
+
+                    n_remain -= line_inp.size();
                }

-                n_remain -= line_inp.size();
-
                input_noecho = true; // do not echo this again
            }

@ -471,7 +436,7 @@ int main(int argc, char ** argv) {
        }

        // In interactive mode, respect the maximum number of tokens and drop back to user input when reached.
-        if (params.interactive && n_remain <= 0) {
+        if (params.interactive && n_remain <= 0 && params.n_predict != -1) {
            n_remain = params.n_predict;
            is_interacting = true;
        }
@ -484,7 +449,7 @@ int main(int argc, char ** argv) {
    llama_print_timings(ctx);
    llama_free(ctx);

-    set_console_state(CONSOLE_STATE_DEFAULT);
+    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);

    return 0;
 }
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -1,15 +1,17 @@
 #include "common.h"
 #include "llama.h"

-std::vector<double> softmax(const std::vector<float>& logits) {
-    std::vector<double> probs(logits.size());
+#include <cmath>
+
+std::vector<float> softmax(const std::vector<float>& logits) {
+    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
    for (float v : logits) max_logit = std::max(max_logit, v);
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        // Subtract the maximum logit value from the current logit value for numerical stability
-        float logit = logits[i] - max_logit;
-        double exp_logit = std::exp(logit);
+        const float logit = logits[i] - max_logit;
+        const float exp_logit = expf(logit);
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
@ -19,15 +21,15 @@ std::vector<double> softmax(const std::vector<float>& logits) {

 void perplexity(llama_context * ctx, const gpt_params & params) {
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-    // Run `./main --perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
    auto tokens = ::llama_tokenize(ctx, params.prompt, true);

    int count = 0;
-    double nll = 0.0;
    int seq_count = tokens.size() / params.n_ctx;
    int n_vocab = llama_n_vocab(ctx);

+    double nll = 0.0;
    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);

    for (int i = 0; i < seq_count; ++i) {
@ -49,7 +51,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        }
        auto end_t = std::chrono::high_resolution_clock::now();
        if (i == 0) {
-            double seconds = std::chrono::duration<double>(end_t - start_t).count();
+            const float seconds = std::chrono::duration<float>(end_t - start_t).count();
            printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0));
        }
        // We get the logits for all the tokens in the context window (params.n_ctx)
@ -70,7 +72,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
            std::vector<float> tok_logits(
                logits.begin() + j * n_vocab,
                logits.begin() + (j + 1) * n_vocab);
-            double prob = softmax(tok_logits)[tokens[start + j + 1]];
+            float prob = softmax(tok_logits)[tokens[start + j + 1]];
            nll += -std::log(prob);
            ++count;
        }
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -4,8 +4,6 @@
 #include <cstdio>
 #include <string>

-const int QK = 32;
-
 // usage:
 //  ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
 //
@ -21,7 +19,7 @@ int main(int argc, char ** argv) {

    // needed to initialize f16 tables
    {
-        struct ggml_init_params params = { 0, NULL };
+        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }
@ -39,7 +37,7 @@ int main(int argc, char ** argv) {
    {
        const int64_t t_start_us = ggml_time_us();

-        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) {
+        if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }
@ -52,8 +50,8 @@ int main(int argc, char ** argv) {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
-        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
-        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0);
+        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0);
    }

    return 0;
--- a/examples/reason-act.sh
+++ b/examples/reason-act.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+# Run ./main interactively with the reason-act prompt. Usage: reason-act.sh [-m MODEL_PATH]
+
+# work from the parent of this script's directory; abort if it cannot be entered
+cd "$(dirname "$0")/.." || exit 1
+
+# get -m model parameter otherwise defer to default
+if [ "$1" == "-m" ]; then
+  MODEL="-m $2 "
+fi
+
+./main $MODEL --color \
+    -f ./prompts/reason-act.txt \
+    -i --interactive-first \
+    --top_k 10000 --temp 0.2 --repeat_penalty 1 -t 7 -c 2048 \
+    -r "Question:" -r "Observation:" --in-prefix " " \
+    -n -1
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -258,11 +258,11 @@ struct ggml_tensor {
    enum ggml_type type;

    int    n_dims;
-    int    ne[GGML_MAX_DIMS]; // number of elements
-    size_t nb[GGML_MAX_DIMS]; // stride in bytes:
-                              // nb[0] = sizeof(type)
-                              // nb[1] = nb[0]   * ne[0] + padding
-                              // nb[i] = nb[i-1] * ne[i-1]
+    int64_t ne[GGML_MAX_DIMS]; // number of elements
+    size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                               // nb[0] = sizeof(type)
+                               // nb[1] = nb[0]   * ne[0] + padding
+                               // nb[i] = nb[i-1] * ne[i-1]

    // compute data
    enum ggml_op op;
@ -316,6 +316,7 @@ struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
    void * mem_buffer; // if NULL, memory will be allocated internally
+    bool   no_alloc;   // don't allocate memory for the tensor data
 };

 void    ggml_time_init(void); // call this once at the beginning of the program
@ -327,8 +328,8 @@ int64_t ggml_cycles_per_ms(void);
 void ggml_print_object (const struct ggml_object * obj);
 void ggml_print_objects(const struct ggml_context * ctx);

-int    ggml_nelements(const struct ggml_tensor * tensor);
-size_t ggml_nbytes   (const struct ggml_tensor * tensor);
+int64_t ggml_nelements(const struct ggml_tensor * tensor);
+size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

 int    ggml_blck_size (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
@ -344,39 +345,43 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

 bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
+bool ggml_mlock(
+        struct ggml_context * ctx,
+        const void *opt_extra_addr,
+        size_t opt_extra_len,
+        char **err_p);

 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
        int    n_dims,
-        const int *ne);
+        const int64_t *ne);

 struct ggml_tensor * ggml_new_tensor_1d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0);
+        int64_t ne0);

 struct ggml_tensor * ggml_new_tensor_2d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1);
+        int64_t ne0,
+        int64_t ne1);

 struct ggml_tensor * ggml_new_tensor_3d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2);

 struct ggml_tensor * ggml_new_tensor_4d(
        struct ggml_context * ctx,
        enum   ggml_type type,
-        int    ne0,
-        int    ne1,
-        int    ne2,
-        int    ne3);
+        int64_t ne0,
+        int64_t ne1,
+        int64_t ne2,
+        int64_t ne3);

 struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
 struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
@ -526,30 +531,30 @@ struct ggml_tensor * ggml_reshape(
 struct ggml_tensor * ggml_reshape_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1);
+        int64_t               ne0,
+        int64_t               ne1);

 // return view(a)
 // TODO: when we start computing gradient, make a copy instead of view
 struct ggml_tensor * ggml_reshape_3d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
-        int                   ne2);
+        int64_t               ne0,
+        int64_t               ne1,
+        int64_t               ne2);

 // offset in bytes
 struct ggml_tensor * ggml_view_1d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
+        int64_t               ne0,
        size_t                offset);

 struct ggml_tensor * ggml_view_2d(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
-        int                   ne0,
-        int                   ne1,
+        int64_t               ne0,
+        int64_t               ne1,
        size_t                nb1, // row stride in bytes
        size_t                offset);

@ -748,8 +753,8 @@ enum ggml_opt_result ggml_opt(
 // quantization
 //

-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);

 //
 // system info
--- a/llama.cpp
+++ b/llama.cpp
@ -12,6 +12,19 @@
 #include <cassert>
 #include <cstring>

+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+#define WIN32_LEAN_AND_MEAN
+#include <Windows.h>
+#else
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+#endif
+
+#define Min(X, Y) ((Y) > (X) ? (X) : (Y))
+#define Max(X, Y) ((Y) < (X) ? (X) : (Y))
+
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16

@ -142,6 +155,10 @@ struct llama_model {
    // the model memory buffer
    std::vector<uint8_t> buf;

+    // model memory mapped file
+    void * mm_addr = NULL;
+    uint64_t mm_length = 0;
+
    // tensors
    int n_loaded;
    std::unordered_map<std::string, struct ggml_tensor *> tensors;
@ -165,6 +182,7 @@ struct llama_context {

    int64_t t_load_us = 0;
    int64_t t_start_us = 0;
+    bool has_evaluated_once = false;

    int64_t t_sample_us = 0;
    int64_t t_eval_us   = 0;
@ -206,7 +224,7 @@ struct llama_context {
        }

        if (buf_last >= 0) {
-            buf_max_size[buf_last] = std::max(buf_max_size[buf_last], last_size);
+            buf_max_size[buf_last] = Max(buf_max_size[buf_last], last_size);
        }

        buf_last = i;
@ -238,14 +256,15 @@ static bool kv_cache_init(
    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;

-    const int n_mem      = n_layer*n_ctx;
-    const int n_elements = n_embd*n_mem;
+    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_elements = n_embd*n_mem;

    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

    struct ggml_init_params params;
    params.mem_size   = cache.buf.size();
    params.mem_buffer = cache.buf.data();
+    params.no_alloc   = false;

    cache.ctx = ggml_init(params);

@ -288,6 +307,58 @@ struct llama_context_params llama_context_default_params() {
 // model loading
 //

+// Map the file `fname` read-only. On success returns the base address and stores the
+// mapped byte length in *mm_length; on any failure returns NULL and leaves *mm_length untouched.
+static void *mmap_file(const char *fname, uint64_t *mm_length) {
+#if defined(_WIN32) && !defined(_POSIX_MAPPED_FILES)
+    HANDLE hFile = CreateFileA(fname, GENERIC_READ,
+                               FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE, NULL, OPEN_EXISTING,
+                               FILE_ATTRIBUTE_NORMAL | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED, NULL);
+    if (hFile == INVALID_HANDLE_VALUE) return NULL;
+    LARGE_INTEGER fileSize;
+    if (!GetFileSizeEx(hFile, &fileSize) || fileSize.QuadPart <= 0) {
+        CloseHandle(hFile); // size unknown or file empty: refuse instead of reporting a bogus length
+        return NULL;
+    }
+    const int64_t length = fileSize.QuadPart;
+    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
+    CloseHandle(hFile);
+    if (!hMapping) return NULL;
+    void *addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+    CloseHandle(hMapping);
+    if (!addr) return NULL;
+#else
+    int fd = open(fname, O_RDONLY);
+    if (fd == -1) return NULL;
+    const int64_t length = lseek(fd, 0, SEEK_END); // -1 when the size query fails
+    void *addr = length > 0 ? mmap(NULL, length, PROT_READ, MAP_SHARED, fd, 0) : MAP_FAILED;
+    close(fd);
+    if (addr == MAP_FAILED) return NULL;
+#endif
+    *mm_length = length;
+    return addr;
+}
+
+static void munmap_file(void * addr, size_t length) { // undo a mapping: counterpart of mmap_file
+#if !(defined(_WIN32) && !defined(_POSIX_MAPPED_FILES))
+    munmap(addr, length);
+#else
+    UnmapViewOfFile(addr); // this branch does not use `length`
+#endif
+}
+
+// Print a bad-magic diagnostic for `path` (magic read: `got`, expected: `want`) to stderr; always returns false.
+static bool report_bad_magic(const char *path, uint32_t got, uint32_t want) {
+    fprintf(stderr, "%s: invalid model file (bad magic [got %#x want %#x])\n"
+                    "\tyou most likely need to regenerate your ggml files\n"
+                    "\tthe benefit is you'll get 10-100x faster load times\n"
+                    "\tsee https://github.com/ggerganov/llama.cpp/issues/91\n"
+                    "\tuse convert-pth-to-ggml.py to regenerate from original pth\n"
+                    "\tuse migrate-ggml-2023-03-30-pr613.py if you deleted originals\n",
+            path, got, want);
+    return false;
+}
+
 static bool llama_model_load(
        const std::string & fname,
        llama_context & lctx,
@ -299,34 +370,35 @@ static bool llama_model_load(
        void *progress_callback_user_data) {
    fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

-    const int64_t t_start_us = ggml_time_us();
-
-    lctx.t_start_us = t_start_us;
-
-    std::vector<char> f_buf(1024*1024);
+    lctx.t_start_us = ggml_time_us();

    auto & model = lctx.model;
    auto & vocab = lctx.vocab;

    auto fin = std::ifstream(fname, std::ios::binary);
-    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return false;
    }

+    std::vector<char> f_buf(1024*1024);
+    fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+
+    fin.seekg(0, fin.end);
+    const size_t file_size = fin.tellg();
+    fin.seekg(0);
+
    // verify magic
    {
        uint32_t magic;
        fin.read((char *) &magic, sizeof(magic));
        if (magic == LLAMA_FILE_MAGIC_UNVERSIONED) {
-            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
+            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files or convert them with convert-unversioned-ggml-to-ggml.py!)\n",
                    __func__, fname.c_str());
            return false;
        }
        if (magic != LLAMA_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
+            return report_bad_magic(fname.c_str(), magic, LLAMA_FILE_MAGIC);
        }

        uint32_t format_version;
@ -449,43 +521,24 @@ static bool llama_model_load(
                }
    }

+    // map model into memory
+    char *mm_addr = NULL;
+    model.mm_addr = mmap_file(fname.c_str(), &model.mm_length);
+    if (model.mm_addr == NULL) {
+        fprintf(stderr, "%s: failed to mmap '%s'\n", __func__, fname.c_str());
+        return false;
+    }
+    mm_addr = (char *)model.mm_addr;
+    fprintf(stderr, "%s: ggml map size = %6.2f MB\n", __func__, model.mm_length/(1024.0*1024.0));
+
    auto & ctx = model.ctx;

    size_t ctx_size = 0;
-
    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
+        const auto &hparams = model.hparams;
        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // tok_embeddings
-
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
-
-        ctx_size += n_embd*n_vocab*ggml_type_sizef(vtype); // output
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
-
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
-
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
-        ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(memory_type); // memory_v
-
        ctx_size += (5 + 10*n_layer)*256; // object overhead
-
-        fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+        fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
    }

    // print memory requirements
@ -495,6 +548,7 @@ static bool llama_model_load(
        // this is the total memory required to run the inference
        const size_t mem_required =
            ctx_size +
+            model.mm_length +
            MEM_REQ_SCRATCH0.at(model.type) +
            MEM_REQ_SCRATCH1.at(model.type) +
            MEM_REQ_EVAL.at    (model.type);
@ -514,6 +568,7 @@ static bool llama_model_load(
        struct ggml_init_params params = {
            /*.mem_size   =*/ lctx.model.buf.size(),
            /*.mem_buffer =*/ lctx.model.buf.data(),
+            /*.no_alloc   =*/ true,
        };

        model.ctx = ggml_init(params);
@ -576,234 +631,106 @@ static bool llama_model_load(
        }
    }

-    const size_t file_offset = fin.tellg();
-
-    fin.close();
-
    std::vector<uint8_t> tmp;

    if (progress_callback) {
        progress_callback(0.0, progress_callback_user_data);
    }

-    for (int i = 0; i < n_parts; ++i) {
-        const int part_id = i;
-        //const int part_id = n_parts - i - 1;
+    fprintf(stderr, "%s: loading tensors from '%s'\n", __func__, fname.c_str());

-        std::string fname_part = fname;
-        if (i > 0) {
-            fname_part += "." + std::to_string(i);
-        }
+    // load weights
+    {
+        size_t total_size = 0;
+        model.n_loaded = 0;

-        fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+        while (true) {
+            int32_t n_dims;
+            int32_t length;
+            int32_t ftype;

-        fin = std::ifstream(fname_part, std::ios::binary);
-        fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
+            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));

-        fin.seekg(0, fin.end);
-        const size_t file_size = fin.tellg();
-
-        fin.seekg(file_offset);
-
-        // load weights
-        {
-            size_t total_size = 0;
-
-            model.n_loaded = 0;
-
-            fprintf(stderr, "%s: ", __func__);
-
-            while (true) {
-                int32_t n_dims;
-                int32_t length;
-                int32_t ftype;
-
-                fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-                fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-                fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-
-                if (fin.eof()) {
-                    break;
-                }
-
-                int32_t nelements = 1;
-                int32_t ne[2] = { 1, 1 };
-                for (int i = 0; i < n_dims; ++i) {
-                    fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                    nelements *= ne[i];
-                }
-
-                std::string name(length, 0);
-                fin.read(&name[0], length);
-
-                if (model.tensors.find(name.data()) == model.tensors.end()) {
-                    fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                    return false;
-                }
-
-                // split_type = 0: split by columns
-                // split_type = 1: split by rows
-                int split_type = 0;
-
-                // split_type = 0:
-                // regex:
-                //   - tok_embeddings.*
-                //   - layers.*.attention.wo.weight
-                //   - layers.*.feed_forward.w2.weight
-
-                // split_type = 1:
-                // regex:
-                //   - output.*
-                //   - layers.*.attention.wq.weight
-                //   - layers.*.attention.wk.weight
-                //   - layers.*.attention.wv.weight
-                //   - layers.*.feed_forward.w1.weight
-                //   - layers.*.feed_forward.w3.weight
-                if (name.find("tok_embeddings") != std::string::npos) {
-                    split_type = 0;
-                } else if (name.find("layers") != std::string::npos) {
-                    if (name.find("attention.wo.weight") != std::string::npos) {
-                        split_type = 0;
-                    } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
-                        split_type = 0;
-                    } else {
-                        split_type = 1;
-                    }
-                } else if (name.find("output") != std::string::npos) {
-                    split_type = 1;
-                }
-
-                auto tensor = model.tensors[name.data()];
-
-                if (n_dims == 1) {
-                    if (ggml_nelements(tensor) != nelements) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                        return false;
-                    }
-                } else {
-                    if (ggml_nelements(tensor)/n_parts != nelements) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                        return false;
-                    }
-                }
-
-                if (n_dims == 1) {
-                    if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-                        return false;
-                    }
-                } else {
-                    if (split_type == 0) {
-                        if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
-                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                    __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
-                            return false;
-                        }
-                    } else {
-                        if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
-                            fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                                    __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
-                            return false;
-                        }
-                    }
-                }
-
-                if (0) {
-                    static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
-                    fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
-                }
-
-                size_t bpe = 0;
-
-                switch (ftype) {
-                    case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
-                    case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
-                    case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
-                    case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
-                    default:
-                            {
-                                fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
-                                return false;
-                            }
-                };
-
-                if (n_dims == 1 || n_parts == 1) {
-                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                                __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                        return false;
-                    }
-
-                    if (part_id == 0) {
-                        fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-                    } else {
-                        fin.seekg(ggml_nbytes(tensor), std::ios::cur);
-                    }
-
-                    total_size += ggml_nbytes(tensor);
-                } else {
-                    if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
-                        fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                                __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
-                        return false;
-                    }
-
-                    if (split_type == 0) {
-                        const int np0 = ne[0];
-
-                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-                        assert(row_size == tensor->nb[1]);
-
-                        for (int i1 = 0; i1 < ne[1]; ++i1) {
-                            const size_t offset_row = i1*row_size;
-                            const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-                            fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
-                        }
-                    } else {
-                        const int np1 = ne[1];
-
-                        const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
-
-                        for (int i1 = 0; i1 < ne[1]; ++i1) {
-                            const size_t offset_row = (i1 + part_id*np1)*row_size;
-                            fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
-                        }
-                    }
-
-                    total_size += ggml_nbytes(tensor)/n_parts;
-                }
-
-                //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-                model.n_loaded++;
-
-                // progress
-                if (progress_callback) {
-                    double current_file_progress = double(size_t(fin.tellg()) - file_offset) / double(file_size - file_offset);
-                    double current_progress = (double(i) + current_file_progress) / double(n_parts);
-                    progress_callback(current_progress, progress_callback_user_data);
-                }
-                if (model.n_loaded % 8 == 0) {
-                    fprintf(stderr, ".");
-                    fflush(stderr);
-                }
+            if (fin.eof()) {
+                break;
            }

-            fprintf(stderr, " done\n");
+            int32_t nelements = 1;
+            int32_t ne[2] = { 1, 1 };
+            for (int i = 0; i < n_dims; ++i) {
+                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+                nelements *= ne[i];
+            }

-            fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
-            if (model.n_loaded == 0) {
-                fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
-            } else if (model.n_loaded != (int) model.tensors.size()) {
-                fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+            std::string name(length, 0);
+            fin.read(&name[0], length);
+
+            if (model.tensors.find(name.data()) == model.tensors.end()) {
+                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
                return false;
            }
+
+            auto tensor = model.tensors[name.data()];
+
+            if (ggml_nelements(tensor) != nelements) {
+                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                return false;
+            }
+            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
+                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                return false;
+            }
+            if (0) {
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                fprintf(stderr, "%24s - [%5d, %5d], type = %6s\n", name.data(), ne[0], ne[1], ftype_str[ftype]);
+            }
+
+            switch (ftype) {
+                case 0:  // f32
+                case 1:  // f16
+                    break;
+                case 2:  // q4_0
+                case 3:  // q4_1
+                    assert(ne[0] % 64 == 0);
+                    break;
+                default:
+                    fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
+                    return false;
+            };
+
+            // load the tensor data into memory without copying or reading it
+            size_t offset = fin.tellg();
+            size_t tensor_data_size = ggml_nbytes(tensor);
+            offset = (offset + 31) & -32;
+            tensor->data = mm_addr + offset;
+            fin.seekg(offset + tensor_data_size);
+            total_size += tensor_data_size;
+            model.n_loaded++;
+
+            // progress
+            if (progress_callback) {
+                double current_progress = size_t(fin.tellg()) / double(file_size);
+                progress_callback(current_progress, progress_callback_user_data);
+            }
        }

        fin.close();
+
+        fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded);
+        if (model.n_loaded == 0) {
+            fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+        } else if (model.n_loaded != (int) model.tensors.size()) {
+            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+            return false;
+        }
    }

-    lctx.t_load_us = ggml_time_us() - t_start_us;
+    // loading time will be recalculated after the first eval, so that
+    // page faults deferred by mmap() are taken into consideration
+    lctx.t_load_us = ggml_time_us() - lctx.t_start_us;

    if (progress_callback) {
        progress_callback(1.0, progress_callback_user_data);
@ -849,6 +776,7 @@ static bool llama_eval_internal(
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_compute.size(),
        /*.mem_buffer =*/ buf_compute.data(),
+        /*.no_alloc   =*/ false,
    };

    struct ggml_context * ctx0 = ggml_init(params);
@ -856,7 +784,7 @@ static bool llama_eval_internal(
    // for big prompts, if BLAS is enabled, it is better to use only one thread
    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
    ggml_cgraph gf = {};
-    gf.n_threads = N > 255 && ggml_cpu_has_blas() ? 1 : n_threads;
+    gf.n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, tokens, N*ggml_element_size(embd));
@ -922,7 +850,7 @@ static bool llama_eval_internal(
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)));
+                        ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));

            // KQ_masked = mask_past(KQ_scaled)
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
@ -1126,7 +1054,7 @@ struct llama_tokenizer {
        size_t offs = 0;
        while (offs < text.size()) {
            llama_sp_symbol sym;
-            size_t char_len = std::min(text.size() - offs, utf8_len(text[offs]));
+            size_t char_len = Min(text.size() - offs, utf8_len(text[offs]));
            sym.text = text.c_str() + offs;
            sym.n = char_len;
            offs += char_len;
@ -1240,12 +1168,12 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
 // sampling
 //

-static void sample_top_k(std::vector<std::pair<double, llama_vocab::id>> & logits_id, int top_k) {
+static void sample_top_k(std::vector<std::pair<float, llama_vocab::id>> & logits_id, int top_k) {
    // find the top k tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, llama_vocab::id> & a, const std::pair<double, llama_vocab::id> & b) {
+            [](const std::pair<float, llama_vocab::id> & a, const std::pair<float, llama_vocab::id> & b) {
        return a.first > b.first;
    });

@ -1256,9 +1184,9 @@ static llama_vocab::id llama_sample_top_p_top_k(
        llama_context & lctx,
        const std::vector<llama_vocab::id> & last_n_tokens,
        int top_k,
-        double top_p,
-        double temp,
-        double repeat_penalty) {
+        float top_p,
+        float temp,
+        float repeat_penalty) {
    auto & rng = lctx.rng;

    const int n_logits = lctx.model.hparams.n_vocab;
@ -1266,17 +1194,17 @@ static llama_vocab::id llama_sample_top_p_top_k(
    const auto & logits = lctx.logits;
    const auto * plogits = logits.data() + logits.size() - n_logits;

-    std::vector<std::pair<double, llama_vocab::id>> logits_id;
+    std::vector<std::pair<float, llama_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
-        const double scale = 1.0/temp;
+        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
-                if (plogits[i] < 0.0) {
+                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
@ -1289,18 +1217,18 @@ static llama_vocab::id llama_sample_top_p_top_k(

    sample_top_k(logits_id, top_k);

-    double maxl = -std::numeric_limits<double>::infinity();
+    float maxl = -std::numeric_limits<float>::infinity();
    for (const auto & kv : logits_id) {
-        maxl = std::max(maxl, kv.first);
+        maxl = Max(maxl, kv.first);
    }

    // compute probs for the top k tokens
-    std::vector<double> probs;
+    std::vector<float> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
-        double p = exp(kv.first - maxl);
+        const float p = expf(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }
@ -1310,8 +1238,8 @@ static llama_vocab::id llama_sample_top_p_top_k(
        p /= sum;
    }

-    if (top_p < 1.0f) {
-        double cumsum = 0.0f;
+    if (top_p < 1.0) {
+        double cumsum = 0.0;
        for (int i = 0; i < (int) probs.size(); i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
@ -1345,7 +1273,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
 //

 // TODO: reuse code from the llama_model_load() somehow
-bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype, int qk) {
+static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
    ggml_type type = GGML_TYPE_Q4_1;

    switch (itype) {
@ -1385,8 +1313,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
            return false;
        }
        if (magic != LLAMA_FILE_MAGIC) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
-            return false;
+            return report_bad_magic(fname_inp.c_str(), magic, LLAMA_FILE_MAGIC);
        }

        fout.write((char *) &magic, sizeof(magic));
@ -1444,7 +1371,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
            return false;
        }

-        std::string word;
+        std::vector<char> word(32);
        vocab.id_to_token.resize(n_vocab);
        for (int i = 0; i < n_vocab; i++) {
            uint32_t len;
@ -1452,17 +1379,17 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
            fout.write((char *) &len, sizeof(len));

            word.resize(len);
-            finp.read ((char *) word.data(), len);
-            fout.write((char *) word.data(), len);
+            finp.read ((char *) &word[0], len);
+            fout.write((char *) &word[0], len);

            float score;
            finp.read ((char *) &score, sizeof(score));
            fout.write((char *) &score, sizeof(score));

-            vocab.token_to_id[word] = i;
+            vocab.token_to_id[word.data()] = i;

            auto &tok_score = vocab.id_to_token[i];
-            tok_score.tok = word;
+            tok_score.tok = word.data();
            tok_score.score = score;
        }
    }
@ -1503,6 +1430,13 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
            std::string name(length, 0);
            finp.read (&name[0], length);

+            {
+                // ensure tensor data is aligned
+                uint64_t offset = finp.tellg();
+                offset = (offset + 31) & -32;
+                finp.seekg(offset);
+            }
+
            {
                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
                printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
@ -1558,6 +1492,13 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
            }
            fout.write(&name[0], length);

+            {
+                // ensure tensor data is aligned
+                uint64_t offset = fout.tellp();
+                offset = (offset + 31) & -32;
+                fout.seekp(offset);
+            }
+
            if (quantize) {
                printf("quantizing .. ");
                work.resize(nelements); // for quantization
@ -1568,11 +1509,11 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
                switch (type) {
                    case GGML_TYPE_Q4_0:
                        {
-                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
+                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                        } break;
                    case GGML_TYPE_Q4_1:
                        {
-                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data());
+                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                        } break;
                    default:
                        {
@ -1590,7 +1531,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str
                }

                for (int i = 0; i < (int) hist_cur.size(); ++i) {
-                    printf("%5.3f ", hist_cur[i] / (float)nelements);
+                    printf("%5.3f ", hist_cur[i] / float(nelements));
                }
                printf("\n");
            } else {
@ -1613,7 +1554,7 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str

            printf("%s: hist: ", __func__);
            for (int i = 0; i < (int) hist_all.size(); ++i) {
-                printf("%5.3f ", hist_all[i] / (float)sum_all);
+                printf("%5.3f ", hist_all[i] / float(sum_all));
            }
            printf("\n");
        }
@ -1655,7 +1596,10 @@ struct llama_context * llama_init_from_file(

    if (params.use_mlock) {
        char *err;
-        if (!ggml_mlock(ctx->model.ctx, &err)) {
+        if (!ggml_mlock(ctx->model.ctx,
+                        ctx->model.mm_addr,
+                        ctx->model.mm_length,
+                        &err)) {
            fprintf(stderr, "%s\n", err);
            free(err);
            llama_free(ctx);
@ -1664,7 +1608,7 @@ struct llama_context * llama_init_from_file(
    }

    // reserve memory for context buffers
-    {
+    if (!params.vocab_only) {
        if (!kv_cache_init(ctx->model.hparams, ctx->model.kv_self, memory_type, ctx->model.hparams.n_ctx)) {
            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
            llama_free(ctx);
@ -1705,15 +1649,18 @@ void llama_free(struct llama_context * ctx) {
        ggml_free(ctx->model.ctx);
    }

+    if (ctx->model.mm_addr) {
+        munmap_file(ctx->model.mm_addr, ctx->model.mm_length);
+    }
+
    delete ctx;
 }

 int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
-               int   itype,
-               int   qk) {
-    if (!llama_model_quantize_internal(fname_inp, fname_out, itype, qk)) {
+               int   itype) {
+    if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) {
        fprintf(stderr, "%s: failed to quantize\n", __func__);
        return 1;
    }
@ -1721,6 +1668,33 @@ int llama_model_quantize(
    return 0;
 }

+// Returns a read-only pointer to the start of the context's KV cache
+// buffer (kv_self.buf); use llama_get_kv_cache_size() for its length.
+const uint8_t * llama_get_kv_cache(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.data();
+}
+
+// Returns the size of the KV cache buffer, i.e. kv_self.buf.size()
+size_t llama_get_kv_cache_size(struct llama_context * ctx) {
+    return ctx->model.kv_self.buf.size();
+}
+
+int llama_get_kv_cache_token_count(struct llama_context * ctx) {
+    return ctx->model.kv_self.n;
+}
+
+// Overwrites the KV cache with n_size bytes from kv_cache and sets its token count
+void llama_set_kv_cache(
+        struct llama_context * ctx,
+               const uint8_t * kv_cache,
+                      size_t   n_size,
+                         int   n_token_count) {
+    // The source must be exactly as large as this context's KV cache buffer
+    LLAMA_ASSERT(ctx->model.kv_self.buf.size() == n_size);
+    memcpy(ctx->model.kv_self.buf.data(), kv_cache, n_size);
+    ctx->model.kv_self.n = n_token_count;
+}
+
 int llama_eval(
        struct llama_context * ctx,
           const llama_token * tokens,
@ -1731,7 +1705,11 @@ int llama_eval(
        fprintf(stderr, "%s: failed to eval\n", __func__);
        return 1;
    }
-
+    // get a more accurate load time, upon first eval
+    if (!ctx->has_evaluated_once) {
+        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
+        ctx->has_evaluated_once = true;
+    }
    return 0;
 }

@ -1796,9 +1774,9 @@ llama_token llama_sample_top_p_top_k(
      const llama_token * last_n_tokens_data,
                    int   last_n_tokens_size,
                    int   top_k,
-                 double   top_p,
-                 double   temp,
-                 double   repeat_penalty) {
+                  float   top_p,
+                  float   temp,
+                  float   repeat_penalty) {
    const int64_t t_start_sample_us = ggml_time_us();

    llama_token result = 0;
@ -1824,21 +1802,20 @@ llama_token llama_sample_top_p_top_k(
 void llama_print_timings(struct llama_context * ctx) {
    const int64_t t_end_us = ggml_time_us();

-    const int32_t n_sample = std::max(1, ctx->n_sample);
-    const int32_t n_eval   = std::max(1, ctx->n_eval);
-    const int32_t n_p_eval = std::max(1, ctx->n_p_eval);
+    const int32_t n_sample = Max(1, ctx->n_sample);
+    const int32_t n_eval   = Max(1, ctx->n_eval);
+    const int32_t n_p_eval = Max(1, ctx->n_p_eval);

    fprintf(stderr, "\n");
-    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
-    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample);
-    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3f * ctx->t_p_eval_us, n_p_eval, 1e-3f * ctx->t_p_eval_us / n_p_eval);
-    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3f * ctx->t_eval_us,   n_eval,   1e-3f * ctx->t_eval_us   / n_eval);
-    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+    fprintf(stderr, "%s:        load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0);
+    fprintf(stderr, "%s:      sample time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample);
+    fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval);
+    fprintf(stderr, "%s:        eval time = %8.2f ms / %5d runs   (%8.2f ms per run)\n",   __func__, 1e-3 * ctx->t_eval_us,   n_eval,   1e-3 * ctx->t_eval_us   / n_eval);
+    fprintf(stderr, "%s:       total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0);
 }

 void llama_reset_timings(struct llama_context * ctx) {
    ctx->t_start_us = ggml_time_us();
-
    ctx->t_sample_us = ctx->n_sample = 0;
    ctx->t_eval_us   = ctx->n_eval   = 0;
    ctx->t_p_eval_us = ctx->n_p_eval = 0;
--- a/llama.h
+++ b/llama.h
@ -6,7 +6,7 @@
 #include <stdbool.h>

 #ifdef LLAMA_SHARED
-#    ifdef _WIN32
+#    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef LLAMA_BUILD
 #            define LLAMA_API __declspec(dllexport)
 #        else
@ -20,7 +20,7 @@
 #endif

 #define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
+#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
 #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files

 #ifdef __cplusplus
@ -45,7 +45,7 @@ extern "C" {

    } llama_token_data;

-    typedef void (*llama_progress_callback)(double progress, void *ctx);
+    typedef void (*llama_progress_callback)(float progress, void *ctx);

    struct llama_context_params {
        int n_ctx;   // text context
@ -81,8 +81,24 @@ extern "C" {
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
-                   int   itype,
-                   int   qk);
+                   int   itype);
+
+    // Returns the KV cache that will contain the context for the
+    // ongoing prediction with the model.
+    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
+
+    // Returns the size of the KV cache
+    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+
+    // Returns the number of tokens in the KV cache
+    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+
+    // Sets the KV cache containing the current context for the model
+    LLAMA_API void llama_set_kv_cache(
+            struct llama_context * ctx,
+                   const uint8_t * kv_cache,
+                          size_t   n_size,
+                             int   n_token_count);

    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
@ -135,9 +151,9 @@ extern "C" {
          const llama_token * last_n_tokens_data,
                        int   last_n_tokens_size,
                        int   top_k,
-                     double   top_p,
-                     double   temp,
-                     double   repeat_penalty);
+                      float   top_p,
+                      float   temp,
+                      float   repeat_penalty);

    // Performance information
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
--- a/migrate-ggml-2023-03-30-pr613.py
+++ b/migrate-ggml-2023-03-30-pr613.py
@ -0,0 +1,311 @@
+# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic
+#
+# We caused a breaking change to the file format on 2023-03-30 in:
+#     https://github.com/ggerganov/llama.cpp/pull/613
+#
+# (1) If you still have the Meta LLaMA .pth files, then close this
+#     file now; you can just run `convert-pth-to-ggml.py` again to
+#     migrate to the new format. The tool is easier to use too. It
+#     isn't necessary anymore to manage split output files because
+#     the new format always combines things into a single file.
+#
+# (2) If you deleted the Meta LLaMA .pth files to save on disk
+#     space, then this tool is intended to help you.  Please check
+#     out the instructions below.
+#
+# USAGE
+#
+#     python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT
+#
+# PREREQUISITES
+#
+#     pip install numpy
+#     cd llama.cpp
+#     make -j4
+#
+# EXAMPLE (7B MODEL)
+#
+#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
+#     python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin
+#
+#     # check that it works
+#     ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
+#
+#     # you can delete the old files
+#     rm -f models/7B/ggml-model-f16.bin
+#     mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin
+#
+# EXAMPLE (13B MODEL)
+#
+#     # you can replace all the 'f16' with 'q4_0' if you're using quantized weights
+#     python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin
+#
+#     # check that it works
+#     ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?'
+#
+#     # you can delete the old files
+#     rm -f models/13B/ggml-model-f16.bin*
+#     mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin
+#
+
+import argparse
+import os
+import sys
+import json
+import struct
+import numpy as np
+
+# Number of elements in one quantization block (see GGML_BLCK_SIZE below).
+# copy_tensors() also uses it as the byte alignment of tensor data in the output.
+QK = 32
+
+# Numeric ids of the ggml tensor types; they key the two size tables below.
+# NOTE(review): presumably these mirror `enum ggml_type` in ggml.h - confirm.
+GGML_TYPE_Q4_0  = 0
+GGML_TYPE_Q4_1  = 1
+GGML_TYPE_I8    = 2
+GGML_TYPE_I16   = 3
+GGML_TYPE_I32   = 4
+GGML_TYPE_F16   = 5
+GGML_TYPE_F32   = 6
+
+# Human-readable name for each `ftype` code found in a tensor header
+# (only used for the progress message printed by copy_tensors()).
+WTYPE_NAMES = {
+    0: "F32",
+    1: "F16",
+    2: "Q4_0",
+    3: "Q4_1",
+}
+
+# Translates the `ftype` code found in a tensor header into a ggml type id.
+WTYPES = {
+    0: GGML_TYPE_F32,
+    1: GGML_TYPE_F16,
+    2: GGML_TYPE_Q4_0,
+    3: GGML_TYPE_Q4_1,
+}
+
+# Number of elements stored per block for each ggml type
+# (QK for the quantized types, 1 for everything else).
+GGML_BLCK_SIZE = {
+    GGML_TYPE_Q4_0:  QK,
+    GGML_TYPE_Q4_1:  QK,
+    GGML_TYPE_I8:    1,
+    GGML_TYPE_I16:   1,
+    GGML_TYPE_I32:   1,
+    GGML_TYPE_F16:   1,
+    GGML_TYPE_F32:   1,
+}
+
+# Size in bytes of one block of each ggml type; ggml_nbytes() combines this
+# with GGML_BLCK_SIZE to work out how many bytes of tensor data to read.
+# NOTE(review): the Q4 entries look like 4-byte float header(s) plus QK packed
+# 4-bit values - confirm against the block structs in ggml.c.
+GGML_TYPE_SIZE = {
+    GGML_TYPE_Q4_0: 4   + QK//2,
+    GGML_TYPE_Q4_1: 4*2 + QK//2,
+    GGML_TYPE_I8:   1,
+    GGML_TYPE_I16:  2,
+    GGML_TYPE_I32:  4,
+    GGML_TYPE_F16:  2,
+    GGML_TYPE_F32:  4,
+}
+
+# Names of the file-header fields, in the order they appear in the file.
+# read_hparams()/write_hparams() (de)serialize one struct "i" value per entry.
+HPARAMS = [
+    'magic',    # int32
+    'version',  # int32
+    'n_vocab',  # int32
+    'n_embd',   # int32
+    'n_mult',   # int32
+    'n_head',   # int32
+    'n_layer',  # int32
+    'n_rot',    # int32
+    'f16',      # int32
+]
+
+def read_hparams(fin):
+    """Read the fixed-size file header from `fin`.
+
+    One struct "i" integer is consumed per entry of HPARAMS, in order.
+
+    Returns:
+        dict mapping each HPARAMS field name to the integer read for it.
+    """
+    layout = "i" * len(HPARAMS)
+    raw = fin.read(struct.calcsize(layout))
+    return {field: value for field, value in zip(HPARAMS, struct.unpack(layout, raw))}
+
+def write_hparams(fout, hparams):
+    """Write the file header to `fout`.
+
+    The values are taken from `hparams` in HPARAMS order and packed as
+    consecutive struct "i" integers, mirroring read_hparams().
+    """
+    layout = "i" * len(HPARAMS)
+    values = tuple(hparams[field] for field in HPARAMS)
+    fout.write(struct.pack(layout, *values))
+
+def read_tokens(fin, hparams):
+    """Read the vocabulary that follows the header in `fin`.
+
+    Each of the hparams['n_vocab'] records is laid out as: a struct "i"
+    byte count, that many bytes of token text, then a struct "f" score.
+
+    Returns:
+        list of (token_bytes, score) tuples in file order.
+    """
+    vocab = []
+    for _ in range(hparams['n_vocab']):
+        (n_bytes,) = struct.unpack("i", fin.read(4))
+        text = fin.read(n_bytes)
+        (score,) = struct.unpack("f", fin.read(4))
+        vocab.append((text, score))
+    return vocab
+
+def write_tokens(fout, tokens):
+    """Write the vocabulary to `fout` in the layout read_tokens() expects.
+
+    `tokens` is an iterable of (token_bytes, score) pairs; each one is
+    emitted as a struct "i" length, the raw bytes, and a struct "f" score.
+    """
+    for text, score in tokens:
+        record = (
+            struct.pack("i", len(text)),
+            text,
+            struct.pack("f", score),
+        )
+        for piece in record:
+            fout.write(piece)
+
+def ggml_nelements(shape):
+    """Return the product of all extents in `shape` (1 for an empty shape)."""
+    count = 1
+    for extent in shape:
+        count = count * extent
+    return count
+
+def ggml_nbytes(shape, ftype):
+    """Return the number of data bytes of a tensor with the given shape.
+
+    `ftype` is the code from the tensor header; it is translated through
+    WTYPES and the element count is scaled by bytes-per-block over
+    elements-per-block (multiplying first, then integer-dividing).
+    """
+    ggml_type = WTYPES[ftype]
+    scaled = ggml_nelements(shape) * GGML_TYPE_SIZE[ggml_type]
+    return scaled // GGML_BLCK_SIZE[ggml_type]
+
+def copy_tensors(fin, fout, part_id, n_parts):
+    """Copy every tensor of one input part into the unified output file.
+
+    Tensor records are read from `fin` until end of file. For each record the
+    header is written to `fout` with the sharded dimension multiplied by
+    `n_parts`, the position is padded to a multiple of QK bytes, and this
+    part's slice of the data is written at its place inside the full tensor.
+
+    Args:
+        fin: binary input file positioned at the first tensor record
+            (main() consumes the header and vocabulary beforehand).
+        fout: seekable binary output file positioned where the first tensor
+            record belongs (main() rewinds it before every part).
+        part_id: zero-based index of the part being read from `fin`.
+        n_parts: total number of parts that make up the model.
+    """
+    while True:
+
+        # record header: n_dims, name length, ftype (one struct "i" each)
+        b = fin.read(4)
+        if not b: break  # nothing left to read: all tensors of this part are done
+        (n_dims,) = struct.unpack("i", b)
+        b = fin.read(4)
+        (length,) = struct.unpack("i", b)
+        b = fin.read(4)
+        (ftype,) = struct.unpack("i", b)
+
+        assert n_dims in (1, 2)
+
+        # read the extents, then flip their order; for 2-D tensors the code
+        # below treats partshape[0] as the row count and partshape[1] as the
+        # column count
+        partshape = list(range(n_dims))
+        for i in range(n_dims):
+            b = fin.read(4)
+            partshape[i] = struct.unpack("i", b)[0]
+        partshape = list(reversed(partshape))
+
+        name = fin.read(length)
+        data = fin.read(ggml_nbytes(partshape, ftype))
+
+        blck_size = GGML_BLCK_SIZE[WTYPES[ftype]]
+        type_size = GGML_TYPE_SIZE[WTYPES[ftype]]
+
+        print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}")
+
+        # determine dimension along which multipart tensor is sharded
+        #
+        # split_dim 0 regex:
+        #   - output.*
+        #   - layers.*.attention.wq.weight
+        #   - layers.*.attention.wk.weight
+        #   - layers.*.attention.wv.weight
+        #   - layers.*.feed_forward.w1.weight
+        #   - layers.*.feed_forward.w3.weight
+        #
+        # split_dim 1 regex:
+        #   - tok_embeddings.*
+        #   - layers.*.attention.wo.weight
+        #   - layers.*.feed_forward.w2.weight
+        #
+        # (split_dim is only assigned for 2-D tensors; every later use is
+        # guarded by `n_dims > 1` or comes after the `n_dims == 1` branch)
+        if n_dims > 1:
+            split_dim = 1
+            if b"tok_embeddings" in name:
+                split_dim = 1
+            elif b"layers" in name:
+                if b"attention.wo.weight" in name:
+                    split_dim = 1
+                elif b"feed_forward.w2.weight" in name:
+                    split_dim = 1
+                else:
+                    split_dim = 0
+            elif b"output" in name:
+                split_dim = 0
+
+        # output tensor header
+        # (written again by every part at the same offset; the extents go
+        # back out in the reverse order, i.e. the order they were read in)
+        fullshape = list(partshape)
+        if n_dims > 1:
+            fullshape[split_dim] *= n_parts
+        fout.write(struct.pack("iii", n_dims, len(name), ftype))
+        for dim in reversed(fullshape):
+            fout.write(struct.pack("i", dim))
+        fout.write(name)
+
+        # ensure tensor data is aligned
+        # (zero bytes are appended until the offset is a multiple of QK)
+        tensor_data_offset = fout.tell()
+        while tensor_data_offset % QK != 0:
+            fout.write(struct.pack("B", 0))
+            tensor_data_offset += 1
+
+        # output unified mappable tensor data
+        if n_dims == 1 or n_parts == 1:
+            # copy tensor which we thankfully received in one piece
+            # (only part 0 writes it; NOTE(review): this assumes the other
+            # parts carry an identical copy - confirm)
+            if part_id == 0:
+                fout.write(data)
+        elif split_dim == 0:
+            # reassemble multifile tensor containing some of the rows
+            # (this part's rows are one contiguous run inside the full tensor)
+            rows_per_chunk = partshape[0]
+            current_row = part_id * rows_per_chunk
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset = current_row * bytes_per_row
+            fout.seek(tensor_data_offset + offset)
+            fout.write(data)
+        elif split_dim == 1:
+            # reassemble multifile tensor containing some of the cols
+            # (bpr is the byte length of one row of this part, bytes_per_row
+            # that of one full row; each part row is written separately at
+            # this part's column offset within the matching full row)
+            cols_per_chunk = partshape[1]
+            current_col = part_id * cols_per_chunk
+            bpr = partshape[1] // blck_size * type_size
+            bytes_per_row = fullshape[1] // blck_size * type_size
+            offset_current_col = current_col // blck_size * type_size
+            for row in range(partshape[0]):
+                offset_row = row * bytes_per_row
+                offset = offset_row + offset_current_col
+                fout.seek(tensor_data_offset + offset)
+                fout.write(data[row * bpr:row * bpr + bpr])
+
+        # advance file position to next tensor
+        # (jumps over the whole full-size tensor, including the regions that
+        # are filled in while processing the other parts)
+        fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype))
+
+def parse_args():
+    """Parse the command line: two positionals, `fin_path` then `fout_path`."""
+    cli = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format')
+    positionals = (
+        ('fin_path', 'your old ggml file (leave out the .1 .2 etc.)'),
+        ('fout_path', 'your new ggjt file name'),
+    )
+    for dest, help_text in positionals:
+        cli.add_argument(dest, help=help_text)
+    return cli.parse_args()
+
+def main():
+    """Convert a (possibly multi-part) 'ggmf' model into one 'ggjt' file.
+
+    The header and vocabulary are taken from the first input file. Additional
+    parts are discovered by probing for `<fin_path>.1`, `<fin_path>.2`, ...
+    and every part's tensors are merged into the single output file.
+
+    Exits with status 1 if the input already has the 'ggjt' magic or does not
+    have the expected 'ggmf' magic.
+    """
+    args = parse_args()
+    assert args.fin_path
+    assert args.fout_path
+    assert args.fin_path != args.fout_path
+
+    with open(args.fin_path, "rb") as fin:
+        hparams = read_hparams(fin)
+
+        # Validate the magic BEFORE parsing the vocabulary: for a file in any
+        # other layout the header fields are misread, so 'n_vocab' would be
+        # garbage and read_tokens() could crash or try to read huge amounts
+        # of data instead of reporting the problem below.
+        if hparams['magic'] == 0x67676a74:  # ggjt
+            print(f"{args.fin_path}: input ggml has already been converted to 'ggjt' magic\n")
+            sys.exit(1)
+
+        if hparams['magic'] != 0x67676d66:  # ggmf
+            print(f"{args.fin_path}: input ggml file doesn't have expected 'ggmf' magic: {hparams['magic']:#x}\n")
+            sys.exit(1)
+
+        tokens = read_tokens(fin, hparams)
+
+    hparams['magic'] = 0x67676a74  # ggjt
+
+    # count number of multipart files by convention
+    # (part 0 is fin_path itself, part N > 0 is "<fin_path>.N")
+    n_parts = 1
+    while os.path.exists(f"{args.fin_path}.{n_parts}"):
+        n_parts += 1
+
+    # we output a single file for ggml
+    with open(args.fout_path, "wb") as fout:
+        write_hparams(fout, hparams)
+        write_tokens(fout, tokens)
+        offset_of_tensors = fout.tell()
+        # the tensors we load could be split across multiple files
+        for part_id in range(n_parts):
+            # every part starts over at the first tensor record of the output
+            fout.seek(offset_of_tensors)
+            print(f"Processing part {part_id+1} of {n_parts}\n")
+            fin_path = args.fin_path
+            if part_id > 0:
+                fin_path += f".{part_id}"
+            with open(fin_path, "rb") as fin:
+                # skip this part's own header and vocabulary
+                read_tokens(fin, read_hparams(fin))
+                copy_tensors(fin, fout, part_id, n_parts)
+
+    print(f"Done. Output file: {args.fout_path}\n")
+
+if __name__ == "__main__":
+    main()
--- a/models/ggml-vocab.bin
+++ b/models/ggml-vocab.bin
--- a/prompts/reason-act.txt
+++ b/prompts/reason-act.txt
@ -0,0 +1,18 @@
+You run in a loop of Thought, Action, Observation.
+At the end of the loop either Answer or restate your Thought and Action.
+Use Thought to describe your thoughts about the question you have been asked.
+Use Action to run one of these actions available to you:
+- calculate[python math expression]
+Observation will be the result of running those actions
+
+
+Question: What is 4 * 7 / 3?
+Thought: Do I need to use an action? Yes, I use calculate to do math
+Action: calculate[4 * 7 / 3]
+Observation: 9.3333333333
+Thought: Do I need to use an action? No, have the result
+Answer: The calculate tool says it is 9.3333333333
+Question: What is the capital of France?
+Thought: Do I need to use an action? No, I know the answer
+Answer: Paris is the capital of France
+Question:
--- a/quantize.py
+++ b/quantize.py
@ -1,127 +0,0 @@
-#!/usr/bin/env python3
-
-"""Script to execute the "quantize" script on a given set of models."""
-
-import subprocess
-import argparse
-import glob
-import sys
-import os
-
-
-def main():
-    """Update the quantize binary name depending on the platform and parse
-    the command line arguments and execute the script.
-    """
-
-    if "linux" in sys.platform or "darwin" in sys.platform:
-        quantize_script_binary = "quantize"
-
-    elif "win32" in sys.platform or "cygwin" in sys.platform:
-        quantize_script_binary = "quantize.exe"
-
-    else:
-        print("WARNING: Unknown platform. Assuming a UNIX-like OS.\n")
-        quantize_script_binary = "quantize"
-
-    parser = argparse.ArgumentParser(
-        prog='python3 quantize.py',
-        description='This script quantizes the given models by applying the '
-        f'"{quantize_script_binary}" script on them.'
-    )
-    parser.add_argument(
-        'models', nargs='+', choices=('7B', '13B', '30B', '65B'),
-        help='The models to quantize.'
-    )
-    parser.add_argument(
-        '-r', '--remove-16', action='store_true', dest='remove_f16',
-        help='Remove the f16 model after quantizing it.'
-    )
-    parser.add_argument(
-        '-m', '--models-path', dest='models_path',
-        default=os.path.join(os.getcwd(), "models"),
-        help='Specify the directory where the models are located.'
-    )
-    parser.add_argument(
-        '-q', '--quantize-script-path', dest='quantize_script_path',
-        default=os.path.join(os.getcwd(), quantize_script_binary),
-        help='Specify the path to the "quantize" script.'
-    )
-
-    # TODO: Revise this code
-    # parser.add_argument(
-    #     '-t', '--threads', dest='threads', type='int',
-    #     default=os.cpu_count(),
-    #     help='Specify the number of threads to use to quantize many models at '
-    #     'once. Defaults to os.cpu_count().'
-    # )
-
-    args = parser.parse_args()
-    args.models_path = os.path.abspath(args.models_path)
-
-    if not os.path.isfile(args.quantize_script_path):
-        print(
-            f'The "{quantize_script_binary}" script was not found in the '
-            "current location.\nIf you want to use it from another location, "
-            "set the --quantize-script-path argument from the command line."
-        )
-        sys.exit(1)
-
-    for model in args.models:
-        # The model is separated in various parts
-        # (ggml-model-f16.bin, ggml-model-f16.bin.0, ggml-model-f16.bin.1...)
-        f16_model_path_base = os.path.join(
-            args.models_path, model, "ggml-model-f16.bin"
-        )
-
-        f16_model_parts_paths = map(
-            lambda filename: os.path.join(f16_model_path_base, filename),
-            glob.glob(f"{f16_model_path_base}*")
-        )
-
-        for f16_model_part_path in f16_model_parts_paths:
-            if not os.path.isfile(f16_model_part_path):
-                print(
-                    f"The f16 model {os.path.basename(f16_model_part_path)} "
-                    f"was not found in {args.models_path}{os.path.sep}{model}"
-                    ". If you want to use it from another location, set the "
-                    "--models-path argument from the command line."
-                )
-                sys.exit(1)
-
-            __run_quantize_script(
-                args.quantize_script_path, f16_model_part_path
-            )
-
-            if args.remove_f16:
-                os.remove(f16_model_part_path)
-
-
-# This was extracted to a top-level function for parallelization, if
-# implemented. See https://github.com/ggerganov/llama.cpp/pull/222/commits/f8db3d6cd91bf1a1342db9d29e3092bc12dd783c#r1140496406
-
-def __run_quantize_script(script_path, f16_model_part_path):
-    """Run the quantize script specifying the path to it and the path to the
-    f16 model to quantize.
-    """
-
-    new_quantized_model_path = f16_model_part_path.replace("f16", "q4_0")
-    subprocess.run(
-        [script_path, f16_model_part_path, new_quantized_model_path, "2"],
-        check=True
-    )
-
-
-if __name__ == "__main__":
-    try:
-        main()
-
-    except subprocess.CalledProcessError:
-        print("\nAn error ocurred while trying to quantize the models.")
-        sys.exit(1)
-
-    except KeyboardInterrupt:
-        sys.exit(0)
-
-    else:
-        print("\nSuccesfully quantized all models.")
--- a/spm-headers/llama.h
+++ b/spm-headers/llama.h
@ -0,0 +1 @@
+../llama.h
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -5,5 +5,6 @@ function(llama_add_test source)
    add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
 endfunction()

+# llama_add_test(test-double-float.c) # SLOW
 llama_add_test(test-quantize.c)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
--- a/tests/test-double-float.c
+++ b/tests/test-double-float.c
@ -0,0 +1,53 @@
+// These tests may take a long time!
+// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
+// This is done by checking all finite (non-NaN, non-infinite) floats.
+
+#undef NDEBUG
+#include <assert.h>
+#include <immintrin.h>
+#include <math.h>
+#include <stdint.h>
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdouble-promotion"
+
+// ggml.c::quantize_row_q4_0_reference
+// Double-precision variant: v0 is promoted to double for round(), the result
+// is cast to int8_t and offset by 8 (hence the -Wdouble-promotion suppression).
+inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; }
+
+// ggml.c::ggml_silu_f32
+// Double-precision variant of x / (1 + e^-x): the 1.0 literal and exp() make
+// the whole expression evaluate in double before it is returned as float.
+inline static float silu_orig(float x) {
+    return x/(1.0 + exp(-x));
+}
+
+#pragma GCC diagnostic pop
+
+// ggml.c::quantize_row_q4_0_reference
+// Float-only variant of round_orig(): uses roundf(), so no promotion to double.
+inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; }
+
+// ggml.c::ggml_silu_f32
+// Float-only variant of silu_orig(): 1.0f and expf() keep the computation in
+// single precision.
+inline static float silu_float(float x) {
+    return x/(1.0f + expf(-x));
+}
+
+// Exhaustively compares the double-based and float-only helpers:
+//   1. round_orig() vs round_float() for every finite 32-bit float pattern;
+//   2. (only when built with F16C) silu_orig() vs silu_float() for every
+//      16-bit half-precision pattern.
+// Any mismatch trips an assert (NDEBUG is undefined at the top of the file).
+int main(void) {
+    uint32_t x = UINT32_MAX;
+    do {
+        // Reinterpret the bit pattern through a union: reading a uint32_t
+        // object through a `float *` cast violates the strict-aliasing rule
+        // and is undefined behaviour, whereas union punning is well-defined in C.
+        union { uint32_t u; float f; } bits;
+        bits.u = x;
+        assert(!isfinite(bits.f) || (round_orig(bits.f) == round_float(bits.f)));
+    } while (x--); // visits UINT32_MAX down to and including 0
+
+#ifdef __F16C__
+    // GELU and SILU implementations are used with a FP16 lookup table.
+    // The original and float-only results are not equal for all inputs after converting to FP16.
+    // GELU is an approximation anyway (tanh), not tested here.
+    // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
+    for (x = 0; x <= UINT16_MAX; x++) {
+        float f = _cvtsh_ss(x);
+        const float so = silu_orig(f);
+        const float sf = silu_float(f);
+        // accept: identical after rounding to FP16, or adjacent floats
+        assert(   (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
+               || (nextafterf(so, sf) == sf)
+               || (nextafterf(sf, so) == so));
+    }
+#endif
+}
--- a/tests/test-quantize.c
+++ b/tests/test-quantize.c
@ -13,7 +13,7 @@ int main(void) {
        src[i] = (float)(i + 1);
    }

-    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist);
+    size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist);
    assert(size == 20);
    float max_result = ((float *)dst)[0];
    float max_expected = src[31] / ((1 << 3) - 1);
@ -24,7 +24,7 @@ int main(void) {
        assert(q4_result == q4_expected);
    }

-    size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist);
+    size = ggml_quantize_q4_1(src, dst, QK, QK, hist);
    assert(size == 24);
    float delta_result = ((float *)dst)[0];
    float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1);
--- a/tests/test-tokenizer-0.cpp
+++ b/tests/test-tokenizer-0.cpp
@ -77,5 +77,7 @@ int main(int argc, char **argv) {
        }
    }

+    llama_free(ctx);
+
    return 0;
 }