Merge branch 'ggerganov:master' into master

2023-05-10 09:19:12 -05:00 · 2023-05-10 09:19:12 -05:00 · 95218177b0
commit 95218177b0
parent 46da5de195 e6a46b0ed1
18 changed files with 1137 additions and 252 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -120,7 +120,7 @@ jobs:
          make

  macOS-latest-cmake:
-    runs-on: macOS-latest
+    runs-on: macos-latest

    steps:
      - name: Clone
@ -148,22 +148,64 @@ jobs:

  windows-latest-cmake:
    runs-on: windows-latest
+    env:
+      OPENBLAS_VERSION: 0.3.23
+      OPENCL_VERSION: 2023.04.17
+      CLBLAST_VERSION: 1.5.3

    strategy:
      matrix:
        include:
-         - build: 'avx2'
-           defines: ''
-         - build: 'avx'
-           defines: '-DLLAMA_AVX2=OFF'
-         - build: 'avx512'
-           defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'avx2'
+            defines: ''
+          - build: 'avx'
+            defines: '-DLLAMA_AVX2=OFF'
+          - build: 'avx512'
+            defines: '-DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'clblast'
+            defines: '-DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
+          - build: 'openblas'
+            defines: '-DLLAMA_OPENBLAS=ON -DBLAS_LIBRARIES="/LIBPATH:$env:RUNNER_TEMP/openblas/lib" -DOPENBLAS_INC="$env:RUNNER_TEMP/openblas/include"'

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v1

+      - name: Download OpenCL SDK
+        id: get_opencl
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
+          mkdir $env:RUNNER_TEMP/opencl
+          tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
+
+      - name: Download CLBlast
+        id: get_clblast
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/clblast.zip -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-Windows-x64.zip"
+          curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
+          mkdir $env:RUNNER_TEMP/clblast
+          tar.exe -xvf $env:RUNNER_TEMP/clblast.zip -C $env:RUNNER_TEMP/clblast
+          foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
+            $txt = Get-Content -Path $f -Raw
+            $txt.Replace('C:/dependencies/opencl/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
+          }
+
+      - name: Download OpenBLAS
+        id: get_openblas
+        if: ${{ matrix.build == 'openblas' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
+          curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
+          mkdir $env:RUNNER_TEMP/openblas
+          tar.exe -xvf $env:RUNNER_TEMP/openblas.zip -C $env:RUNNER_TEMP/openblas
+          $vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
+          $msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
+          $lib =  $(join-path $msvc 'bin\Hostx64\x64\lib.exe')
+          & $lib /machine:x64 "/def:${env:RUNNER_TEMP}/openblas/lib/libopenblas.def" "/out:${env:RUNNER_TEMP}/openblas/lib/openblas.lib" /name:openblas.dll
+
      - name: Build
        id: cmake_build
        run: |
@ -171,6 +213,21 @@ jobs:
          cd build
          cmake .. ${{ matrix.defines }}
          cmake --build . --config Release
+          cp ../LICENSE ./bin/Release/llama.cpp.txt
+
+      - name: Add clblast.dll
+        id: add_clblast_dll
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
+          cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
+
+      - name: Add libopenblas.dll
+        id: add_libopenblas_dll
+        if: ${{ matrix.build == 'openblas' }}
+        run: |
+          cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
+          cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt

      - name: Check AVX512F support
        id: check_avx512f
@ -187,7 +244,7 @@ jobs:

      - name: Test
        id: cmake_test
-        if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible
+        if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
        run: |
          cd build
          ctest -C Release --verbose
@ -210,6 +267,82 @@ jobs:
          path: |
            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

+  windows-latest-cmake-cublas:
+    runs-on: windows-latest
+
+    strategy:
+      matrix:
+        cuda: ['12.1.0', '11.7.1']
+        build: ['cublas']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - uses: Jimver/cuda-toolkit@v0.2.10
+        id: cuda-toolkit
+        with:
+          cuda: ${{ matrix.cuda }}
+          # TODO(green-sky): _dev seems to fail, and non dev are not enought
+          #sub-packages: '["nvcc", "cudart", "cublas", "cudart_dev", "cublas_dev"]'
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_CUBLAS=ON
+          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip .\build\bin\Release\*
+
+      - name: Upload artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '12.1.0' }}
+        # TODO(green-sky): paths are cuda 12 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_12.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_12.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Copy and pack Cuda runtime
+        if: ${{ matrix.cuda == '11.7.1' }}
+        # TODO(green-sky): paths are cuda 11 specific
+        run: |
+          echo "Cuda install location: ${{steps.cuda-toolkit.outputs.CUDA_PATH}}"
+          mkdir '.\build\bin\cudart\'
+          ls "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin"
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cudart64_110.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublas64_11.dll" '.\build\bin\cudart\'
+          cp "${{steps.cuda-toolkit.outputs.CUDA_PATH}}\bin\cublasLt64_11.dll" '.\build\bin\cudart\'
+          7z a cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip .\build\bin\cudart\*
+
+      - name: Upload Cuda runtime
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-artifact@v3
+        with:
+          path: |
+            cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip
+
  release:
    if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}

@ -221,6 +354,7 @@ jobs:
      - macOS-latest-make
      - macOS-latest-cmake
      - windows-latest-cmake
+      - windows-latest-cmake-cublas

    steps:
      - name: Download artifacts
--- a/.gitignore
+++ b/.gitignore
@ -21,6 +21,7 @@ build-sanitize-addr/
 build-sanitize-thread/

 models/*
+*.bin

 /main
 /quantize
@ -42,5 +43,6 @@ zig-out/
 zig-cache/

 ppl-*.txt
+qnt-*.txt

 examples/jeopardy/results.txt
--- a/13
+++ b/13
@ -107,7 +107,11 @@ ifndef LLAMA_NO_ACCELERATE
 endif
 ifdef LLAMA_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
-	LDFLAGS += -lopenblas
+	ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
+		LDFLAGS += -lopenblas -lcblas
+	else
+		LDFLAGS += -lopenblas
+	endif
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
@ -121,7 +125,12 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 endif
 ifdef LLAMA_CLBLAST
 	CFLAGS  += -DGGML_USE_CLBLAST
-	LDFLAGS += -lclblast -lOpenCL
+	# Mac provides OpenCL as a framework
+	ifeq ($(UNAME_S),Darwin)
+		LDFLAGS += -lclblast -framework OpenCL
+	else
+		LDFLAGS += -lclblast -lOpenCL
+	endif
 	OBJS    += ggml-opencl.o
 ggml-opencl.o: ggml-opencl.c ggml-opencl.h
 	$(CC) $(CFLAGS) -c $< -o $@
--- a/README.md
+++ b/README.md
@ -7,21 +7,64 @@

 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

+## ⚠️ TEMPORARY NOTICE ABOUT UPCOMING BREAKING CHANGE ⚠️
+
+**The quantization formats will soon be updated: https://github.com/ggerganov/llama.cpp/pull/1305**
+
+**All `ggml` model files using the old format will not work with the latest `llama.cpp` code after that change is merged**
+
+---
+
 **Hot topics:**

 - [Roadmap May 2023](https://github.com/ggerganov/llama.cpp/discussions/1220)
 - [New quantization methods](https://github.com/ggerganov/llama.cpp#quantization)

+<details>
+  <summary>Table of Contents</summary>
+  <ol>
+    <li>
+      <a href="#description">Description</a>
+    </li>
+    <li>
+      <a href="#usage">Usage</a>
+      <ul>
+        <li><a href="#get-the-code">Get the Code</a></li>
+        <li><a href="#build">Build</a></li>
+        <li><a href="#blas-build">BLAS Build</a></li>
+        <li><a href="#prepare-data--run">Prepare Data & Run</a></li>
+        <li><a href="#memorydisk-requirements">Memory/Disk Requirements</a></li>
+        <li><a href="#quantization">Quantization</a></li>
+        <li><a href="#interactive-mode">Interactive mode</a></li>
+        <li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
+        <li><a href="#using-gpt4all">Using GPT4All</a></li>
+        <li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
+        <li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
+        <li><a href="#verifying-the-model-files">Verifying the model files</a></li>
+        <li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
+        <li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
+        <li><a href="#android">Android</a></li>
+        <li><a href="#docker">Docker</a></li>
+      </ul>
+    </li>
+    <li><a href="#contributing">Contributing</a></li>
+    <li><a href="#coding-guidelines">Coding guidelines</a></li>
+    <li><a href="#docs">Docs</a></li>
+  </ol>
+</details>
+
 ## Description

 The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quantization on a MacBook

 - Plain C/C++ implementation without dependencies
 - Apple silicon first-class citizen - optimized via ARM NEON and Accelerate framework
- AVX2 support for x86 architectures
+- AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
- 4-bit integer quantization support
+- 4-bit, 5-bit and 8-bit integer quantization support
 - Runs on the CPU
+- OpenBLAS support
+- cuBLAS and CLBlast support

 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
 Since then, the project has improved significantly thanks to many contributions. This project is for educational purposes and serves
@ -43,6 +86,8 @@ as the main playground for developing new features for the [ggml](https://github
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
 - [X] [Vicuna](https://github.com/ggerganov/llama.cpp/discussions/643#discussioncomment-5533894)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
+- [X] [OpenBuddy 🐶 (Multilingual)](https://github.com/OpenBuddy/OpenBuddy)
+- [X] [Pygmalion 7B / Metharme 7B](#using-pygmalion-7b--metharme-7b)

 **Bindings:**

@ -213,7 +258,6 @@ Building the program with BLAS support may lead to some performance improvements
      ```bash
      make LLAMA_OPENBLAS=1
      ```
-      Note: In order to build on Arch Linux with OpenBLAS support enabled you must edit the Makefile adding at the end of the line 105: `-lcblas`

    - On Windows:

@ -255,6 +299,8 @@ Building the program with BLAS support may lead to some performance improvements
    cmake --build . --config Release
    ```

+Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
+
 ### Prepare Data & Run

 ```bash
@ -294,17 +340,25 @@ Several quantization methods are supported. They differ in the resulting model d

 | Model | Measure      | F16    | Q4_0   | Q4_1   | Q4_2   | Q5_0   | Q5_1   | Q8_0   |
 |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:|-------:|
-|    7B | perplexity   | 5.9565 | 6.2103 | 6.1286 | 6.1698 | 6.0139 | 5.9934 | 5.9571 |
+|    7B | perplexity   | 5.9066 | 6.1620 | 6.0910 | 6.1466 | 5.9862 | 5.9481 | 5.9069 |
 |    7B | file size    |  13.0G |   4.0G |   4.8G |   4.0G |   4.4G |   4.8G |   7.1G |
 |    7B | ms/tok @ 4th |    128 |     56 |     61 |     84 |     91 |     95 |     75 |
 |    7B | ms/tok @ 8th |    128 |     47 |     55 |     48 |     53 |     59 |     75 |
 |    7B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |
-|   13B | perplexity   | 5.2455 | 5.3748 | 5.3471 | 5.3433 | 5.2768 | 5.2582 | 5.2458 |
+|   13B | perplexity   | 5.2543 | 5.3863 | 5.3607 | 5.3513 | 5.2856 | 5.2706 | 5.2548 |
 |   13B | file size    |  25.0G |   7.6G |   9.1G |   7.6G |   8.4G |   9.1G |    14G |
 |   13B | ms/tok @ 4th |    239 |    104 |    113 |    160 |    176 |    185 |    141 |
 |   13B | ms/tok @ 8th |    240 |     85 |     99 |     97 |    108 |    117 |    147 |
 |   13B | bits/weight  |   16.0 |    5.0 |    6.0 |    5.0 |    5.5 |    6.0 |    9.0 |

+### Perplexity (measuring model quality)
+
+You can use the `perplexity` example to measure perplexity over a given prompt (lower perplexity is better).
+For more information, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity).
+
+The perplexity measurements in table above are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with context length of 512.
+The time per token is measured on a MacBook M1 Pro 32GB RAM using 4 and 8 threads.
+
 ### Interactive mode

 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
@ -371,6 +425,19 @@ python3 convert.py models/gpt4all-7B/gpt4all-lora-quantized.bin

 - The newer GPT4All-J model is not yet supported!

+### Using Pygmalion 7B & Metharme 7B
+
+- Obtain the [LLaMA weights](#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data)
+- Obtain the [Pygmalion 7B](https://huggingface.co/PygmalionAI/pygmalion-7b/) or [Metharme 7B](https://huggingface.co/PygmalionAI/metharme-7b) XOR encoded weights
+- Convert the LLaMA model with [the latest HF convert script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py)
+- Merge the XOR files with the converted LLaMA weights by running the [xor_codec](https://huggingface.co/PygmalionAI/pygmalion-7b/blob/main/xor_codec.py) script
+- Convert to `ggml` format using the `convert.py` script in this repo:
+```bash
+python3 convert.py pygmalion-7b/ --outtype q4_1
+```
+> The Pygmalion 7B & Metharme 7B weights are saved in [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format) precision. If you wish to convert to `ggml` without quantizating, please specify the `--outtype` as `f32` instead of `f16`.
+
+
 ### Obtaining the Facebook LLaMA original model and Stanford Alpaca model data

 - **Under no circumstances should IPFS, magnet links, or any other links to model downloads be shared anywhere in this repository, including in issues, discussions, or pull requests. They will be immediately deleted.**
@ -403,26 +470,6 @@ If your issue is with model generation quality, then please at least scan the fo
    - [Aligning language models to follow instructions](https://openai.com/research/instruction-following)
    - [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)

-### Perplexity (measuring model quality)
-
-You can use the `perplexity` example to measure perplexity over the given prompt. For more background, see [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity). However, in general, lower perplexity is better for LLMs.
-
-#### Latest measurements
-
-The latest perplexity scores for the various model sizes and quantizations are being tracked in [discussion #406](https://github.com/ggerganov/llama.cpp/discussions/406). `llama.cpp` is measuring very well compared to the baseline implementations. Quantization has a small negative impact on quality, but, as you can see, running
-13B at q4_0 beats the 7B f16 model by a significant amount.
-
-All measurements are done against the wikitext2 test dataset (https://paperswithcode.com/dataset/wikitext-2), with default options (512 length context).
-Note that changing the context length will have a significant impact on perplexity (longer context = better perplexity).
-```
-Perplexity - model options
-5.5985 - 13B, q4_0
-5.9565 - 7B, f16
-6.3001 - 7B, q4_1
-6.5949 - 7B, q4_0
-6.5995 - 7B, q4_0, --memory_f16
-```
-
 #### How to run

 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
--- a/convert.py
+++ b/convert.py
@ -67,6 +67,7 @@ FTYPE_TO_DATA_TYPE: Dict[int, DataType] = \
    {ftype: dtype for (dtype, ftype) in DATA_TYPE_TO_FTYPE.items()}

 DATA_TYPE_TO_NUMPY: Dict[DataType, 'np.dtype[Any]'] = {
+    DT_BF16: np.dtype(np.uint16),
    DT_F16: np.dtype(np.float16),
    DT_F32: np.dtype(np.float32),
    DT_I32: np.dtype(np.int32),
@ -276,6 +277,12 @@ class Tensor(metaclass=ABCMeta):
    def to_ggml(self) -> 'GGMLCompatibleTensor': ...


+def bf16_to_fp32(bf16_arr: np.ndarray) -> np.ndarray:
+    assert bf16_arr.dtype == np.uint16, f"Input array should be of dtype uint16, but got {bf16_arr.dtype}"
+    fp32_arr = bf16_arr.astype(np.uint32) << 16
+    return fp32_arr.view(np.float32)
+
+
 class UnquantizedTensor(Tensor):
    def __init__(self, ndarray: NDArray) -> None:
        assert isinstance(ndarray, np.ndarray)
@ -284,6 +291,8 @@ class UnquantizedTensor(Tensor):

    def astype(self, data_type: DataType) -> Tensor:
        dtype = DATA_TYPE_TO_NUMPY[data_type]
+        if self.data_type == DT_BF16:
+            self.ndarray = bf16_to_fp32(self.ndarray)
        return UnquantizedTensor(self.ndarray.astype(dtype))

    def to_ggml(self) -> 'UnquantizedTensor':
@ -686,6 +695,7 @@ class LazyUnpickler(pickle.Unpickler):
        description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}'
        return LazyStorage(load=load, kind=pid[1], description=description)

+   # @staticmethod
    def lazy_rebuild_tensor_v2(storage: Any, storage_offset: Any, size: Any, stride: Any,  # pyright: ignore[reportSelfClsParameterName]
                               requires_grad: Any, backward_hooks: Any, metadata: Any = None) -> LazyTensor:
        assert isinstance(storage, LazyStorage)
@ -696,12 +706,18 @@ class LazyUnpickler(pickle.Unpickler):
        description = f'pickled storage_offset={storage_offset} in {storage.description}'
        return LazyTensor(load, list(size), storage.kind.data_type, description)

+    # @staticmethod
+    def rebuild_from_type_v2(func, new_type, args, state):
+        return func(*args)
+
    CLASSES: Dict[Any, Any] = {
+        ('torch._tensor', '_rebuild_from_type_v2'): rebuild_from_type_v2,
        ('torch._utils', '_rebuild_tensor_v2'): lazy_rebuild_tensor_v2,
        ('torch', 'BFloat16Storage'): LazyStorageKind(DT_BF16),
        ('torch', 'HalfStorage'): LazyStorageKind(DT_F16),
        ('torch', 'FloatStorage'): LazyStorageKind(DT_F32),
        ('torch', 'IntStorage'): LazyStorageKind(DT_I32),
+        ('torch', 'Tensor'): LazyTensor,
    }

    def find_class(self, module: str, name: str) -> Any:
@ -750,7 +766,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus:
            return UnquantizedTensor(np.frombuffer(buf, dtype=numpy_dtype).reshape(shape))
        description = f'safetensors begin={begin} end={end} type={data_type} path={path}'
        return LazyTensor(load, shape, data_type, description)
-    model = {name: convert(info) for (name, info) in header.items()}
+    model = {name: convert(info) for (name, info) in header.items() if name != '__metadata__'}
    return ModelPlus(model=model, paths=[path], format='safetensors', vocab=None)


@ -961,7 +977,7 @@ class OutputFile:

 def pick_output_type(model: LazyModel, output_type_str: Optional[str]) -> GGMLFileType:
    wq_type = model["layers.0.attention.wq.weight"].data_type
-    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
        return GGMLFileType.AllF32
    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
        return GGMLFileType.MostlyF16
@ -1035,8 +1051,12 @@ def load_some_model(path: Path) -> ModelPlus:
    '''Load a model of any supported format.'''
    # Be extra-friendly and accept either a file or a directory:
    if path.is_dir():
-        globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
-        files = [file for glob in globs for file in path.glob(glob)]
+        # Check if it's a set of safetensors files first
+        files = list(path.glob("model-00001-of-*.safetensors"))
+        if not files:
+            # Try the PyTorch patterns too, with lower priority
+            globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt"]
+            files = [file for glob in globs for file in path.glob(glob)]
        if not files:
            # Try GGML too, but with lower priority, since if both a non-GGML
            # model and a GGML model exist in the same directory, we assume the
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -14,20 +14,16 @@
 #include <sys/sysctl.h>
 #endif

-#if defined (_WIN32)
+#if defined(_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <windows.h>
 #include <fcntl.h>
 #include <io.h>
-#pragma comment(lib,"kernel32.lib")
-extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle);
-extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode);
-extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode);
-extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID);
-extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID);
-extern "C" __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int CodePage, unsigned long dwFlags,
-                                                                   const wchar_t * lpWideCharStr, int cchWideChar,
-                                                                   char * lpMultiByteStr, int cbMultiByte,
-                                                                   const char * lpDefaultChar, bool * lpUsedDefaultChar);
-#define CP_UTF8 65001
+#else
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <wchar.h>
 #endif

 int32_t get_num_physical_cores() {
@ -66,35 +62,33 @@ int32_t get_num_physical_cores() {
    return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }

-std::string process_escapes(const char* input) {
-    std::string output;
+void process_escapes(std::string& input) {
+    std::size_t input_len = input.length();
+    std::size_t output_idx = 0;

-    if (input != nullptr) {
-        std::size_t input_len = std::strlen(input);
-        output.reserve(input_len);
-
-        for (std::size_t i = 0; i < input_len; ++i) {
-            if (input[i] == '\\' && i + 1 < input_len) {
-                switch (input[++i]) {
-                    case 'n':  output.push_back('\n'); break;
-                    case 't':  output.push_back('\t'); break;
-                    case '\'': output.push_back('\''); break;
-                    case '\"': output.push_back('\"'); break;
-                    case '\\': output.push_back('\\'); break;
-                    default:   output.push_back('\\');
-                               output.push_back(input[i]); break;
-                }
-            } else {
-                output.push_back(input[i]);
+    for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
+        if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
+            switch (input[++input_idx]) {
+                case 'n':  input[output_idx++] = '\n'; break;
+                case 'r':  input[output_idx++] = '\r'; break;
+                case 't':  input[output_idx++] = '\t'; break;
+                case '\'': input[output_idx++] = '\''; break;
+                case '\"': input[output_idx++] = '\"'; break;
+                case '\\': input[output_idx++] = '\\'; break;
+                default:   input[output_idx++] = '\\';
+                           input[output_idx++] = input[input_idx]; break;
            }
+        } else {
+            input[output_idx++] = input[input_idx];
        }
    }

-    return output;
+    input.resize(output_idx);
 }

 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    bool invalid_param = false;
+    bool escape_prompt = false;
    std::string arg;
    gpt_params default_params;

@ -102,6 +96,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        arg = argv[i];

        if (arg == "-s" || arg == "--seed") {
+#if defined(GGML_USE_CUBLAS)
+            fprintf(stderr, "WARNING: when using cuBLAS generation results are NOT guaranteed to be reproducible.\n");
+#endif
            if (++i >= argc) {
                invalid_param = true;
                break;
@ -118,7 +115,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                invalid_param = true;
                break;
            }
-            params.prompt = process_escapes(argv[i]);
+            params.prompt = argv[i];
+        } else if (arg == "-e") {
+            escape_prompt = true;
        } else if (arg == "--session") {
            if (++i >= argc) {
                invalid_param = true;
@ -266,6 +265,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
            params.interactive_first = true;
        } else if (arg == "-ins" || arg == "--instruct") {
            params.instruct = true;
+        } else if (arg == "--multiline-input") {
+            params.multiline_input = true;
        } else if (arg == "--color") {
            params.use_color = true;
        } else if (arg == "--mlock") {
@ -324,6 +325,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                break;
            }
            params.input_prefix = argv[i];
+        } else if (arg == "--in-suffix") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.input_suffix = argv[i];
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            gpt_print_usage(argc, argv, default_params);
@ -335,6 +342,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
        gpt_print_usage(argc, argv, default_params);
        exit(1);
    }
+    if (escape_prompt) {
+        process_escapes(params.prompt);
+    }

    return true;
 }
@ -347,6 +357,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
    fprintf(stderr, "  --interactive-first   run in interactive mode and wait for input right away\n");
    fprintf(stderr, "  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
+    fprintf(stderr, "  --multiline-input     allows you to write or paste multiple lines without ending each in '\\'\n");
    fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
    fprintf(stderr, "                        run in interactive mode and poll user input upon seeing PROMPT (can be\n");
    fprintf(stderr, "                        specified more than once for multiple prompts).\n");
@ -355,9 +366,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: empty)\n");
+    fprintf(stderr, "  -e                    process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
    fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
    fprintf(stderr, "  --random-prompt       start with a randomized prompt.\n");
    fprintf(stderr, "  --in-prefix STRING    string to prefix user inputs with (default: empty)\n");
+    fprintf(stderr, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        prompt file to start generation.\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
@ -424,8 +437,8 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 // TODO: not great allocating this every time
 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    // initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
-    std::vector<llama_token> res(text.size() + (int)add_bos);
-    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
+    std::vector<llama_token> res(text.size() + (int) add_bos);
+    const int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
    assert(n >= 0);
    res.resize(n);

@ -465,54 +478,340 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
    return lctx;
 }

-/* Keep track of current color of output, and emit ANSI code if it changes. */
-void set_console_color(console_state & con_st, console_color_t color) {
-    if (con_st.use_color && con_st.color != color) {
-        switch(color) {
-            case CONSOLE_COLOR_DEFAULT:
-                printf(ANSI_COLOR_RESET);
-                break;
-            case CONSOLE_COLOR_PROMPT:
-                printf(ANSI_COLOR_YELLOW);
-                break;
-            case CONSOLE_COLOR_USER_INPUT:
-                printf(ANSI_BOLD ANSI_COLOR_GREEN);
-                break;
-        }
-        con_st.color = color;
-    }
-}
-
-#if defined (_WIN32)
-void win32_console_init(bool enable_color) {
-    unsigned long dwMode = 0;
-    void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11)
-    if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) {
-        hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12)
-        if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) {
-            hConOut = 0;
+void console_init(console_state & con_st) {
+#if defined(_WIN32)
+    // Windows-specific console initialization
+    DWORD dwMode = 0;
+    con_st.hConsole = GetStdHandle(STD_OUTPUT_HANDLE);
+    if (con_st.hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(con_st.hConsole, &dwMode)) {
+        con_st.hConsole = GetStdHandle(STD_ERROR_HANDLE);
+        if (con_st.hConsole != INVALID_HANDLE_VALUE && (!GetConsoleMode(con_st.hConsole, &dwMode))) {
+            con_st.hConsole = NULL;
        }
    }
-    if (hConOut) {
+    if (con_st.hConsole) {
        // Enable ANSI colors on Windows 10+
-        if (enable_color && !(dwMode & 0x4)) {
-            SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4)
+        if (con_st.use_color && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
+            SetConsoleMode(con_st.hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
        }
        // Set console output codepage to UTF8
        SetConsoleOutputCP(CP_UTF8);
    }
-    void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10)
-    if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) {
+    HANDLE hConIn = GetStdHandle(STD_INPUT_HANDLE);
+    if (hConIn != INVALID_HANDLE_VALUE && GetConsoleMode(hConIn, &dwMode)) {
        // Set console input codepage to UTF16
        _setmode(_fileno(stdin), _O_WTEXT);
+
+        // Turn off ICANON (ENABLE_LINE_INPUT) and ECHO (ENABLE_ECHO_INPUT)
+        dwMode &= ~(ENABLE_LINE_INPUT | ENABLE_ECHO_INPUT);
+        SetConsoleMode(hConIn, dwMode);
+    }
+#else
+    // POSIX-specific console initialization
+    struct termios new_termios;
+    tcgetattr(STDIN_FILENO, &con_st.prev_state);
+    new_termios = con_st.prev_state;
+    new_termios.c_lflag &= ~(ICANON | ECHO);
+    new_termios.c_cc[VMIN] = 1;
+    new_termios.c_cc[VTIME] = 0;
+    tcsetattr(STDIN_FILENO, TCSANOW, &new_termios);
+
+    con_st.tty = fopen("/dev/tty", "w+");
+    if (con_st.tty != nullptr) {
+        con_st.out = con_st.tty;
+    }
+
+    setlocale(LC_ALL, "");
+#endif
+}
+
+void console_cleanup(console_state & con_st) {
+    // Reset console color
+    console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
+
+#if !defined(_WIN32)
+    if (con_st.tty != nullptr) {
+        con_st.out = stdout;
+        fclose(con_st.tty);
+        con_st.tty = nullptr;
+    }
+    // Restore the terminal settings on POSIX systems
+    tcsetattr(STDIN_FILENO, TCSANOW, &con_st.prev_state);
+#endif
+}
+
+/* Keep track of current color of output, and emit ANSI code if it changes. */
+void console_set_color(console_state & con_st, console_color_t color) {
+    if (con_st.use_color && con_st.color != color) {
+        fflush(stdout);
+        switch(color) {
+            case CONSOLE_COLOR_DEFAULT:
+                fprintf(con_st.out, ANSI_COLOR_RESET);
+                break;
+            case CONSOLE_COLOR_PROMPT:
+                fprintf(con_st.out, ANSI_COLOR_YELLOW);
+                break;
+            case CONSOLE_COLOR_USER_INPUT:
+                fprintf(con_st.out, ANSI_BOLD ANSI_COLOR_GREEN);
+                break;
+        }
+        con_st.color = color;
+        fflush(con_st.out);
    }
 }

-// Convert a wide Unicode string to an UTF8 string
-void win32_utf8_encode(const std::wstring & wstr, std::string & str) {
-    int size_needed = WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), NULL, 0, NULL, NULL);
-    std::string strTo(size_needed, 0);
-    WideCharToMultiByte(CP_UTF8, 0, &wstr[0], (int)wstr.size(), &strTo[0], size_needed, NULL, NULL);
-    str = strTo;
-}
+char32_t getchar32() {
+    wchar_t wc = getwchar();
+    if (static_cast<wint_t>(wc) == WEOF) {
+        return WEOF;
+    }
+
+#if WCHAR_MAX == 0xFFFF
+    if ((wc >= 0xD800) && (wc <= 0xDBFF)) { // Check if wc is a high surrogate
+        wchar_t low_surrogate = getwchar();
+        if ((low_surrogate >= 0xDC00) && (low_surrogate <= 0xDFFF)) { // Check if the next wchar is a low surrogate
+            return (static_cast<char32_t>(wc & 0x03FF) << 10) + (low_surrogate & 0x03FF) + 0x10000;
+        }
+    }
+    if ((wc >= 0xD800) && (wc <= 0xDFFF)) { // Invalid surrogate pair
+        return 0xFFFD; // Return the replacement character U+FFFD
+    }
 #endif
+
+    return static_cast<char32_t>(wc);
+}
+
+void pop_cursor(console_state & con_st) {
+#if defined(_WIN32)
+    if (con_st.hConsole != NULL) {
+        CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+        GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo);
+
+        COORD newCursorPosition = bufferInfo.dwCursorPosition;
+        if (newCursorPosition.X == 0) {
+            newCursorPosition.X = bufferInfo.dwSize.X - 1;
+            newCursorPosition.Y -= 1;
+        } else {
+            newCursorPosition.X -= 1;
+        }
+
+        SetConsoleCursorPosition(con_st.hConsole, newCursorPosition);
+        return;
+    }
+#endif
+    putc('\b', con_st.out);
+}
+
+int estimateWidth(char32_t codepoint) {
+#if defined(_WIN32)
+    return 1;
+#else
+    return wcwidth(codepoint);
+#endif
+}
+
+int put_codepoint(console_state & con_st, const char* utf8_codepoint, size_t length, int expectedWidth) {
+#if defined(_WIN32)
+    CONSOLE_SCREEN_BUFFER_INFO bufferInfo;
+    if (!GetConsoleScreenBufferInfo(con_st.hConsole, &bufferInfo)) {
+        // go with the default
+        return expectedWidth;
+    }
+    COORD initialPosition = bufferInfo.dwCursorPosition;
+    DWORD nNumberOfChars = length;
+    WriteConsole(con_st.hConsole, utf8_codepoint, nNumberOfChars, &nNumberOfChars, NULL);
+
+    CONSOLE_SCREEN_BUFFER_INFO newBufferInfo;
+    GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
+
+    // Figure out our real position if we're in the last column
+    if (utf8_codepoint[0] != 0x09 && initialPosition.X == newBufferInfo.dwSize.X - 1) {
+        DWORD nNumberOfChars;
+        WriteConsole(con_st.hConsole, &" \b", 2, &nNumberOfChars, NULL);
+        GetConsoleScreenBufferInfo(con_st.hConsole, &newBufferInfo);
+    }
+
+    int width = newBufferInfo.dwCursorPosition.X - initialPosition.X;
+    if (width < 0) {
+        width += newBufferInfo.dwSize.X;
+    }
+    return width;
+#else
+    // we can trust expectedWidth if we've got one
+    if (expectedWidth >= 0 || con_st.tty == nullptr) {
+        fwrite(utf8_codepoint, length, 1, con_st.out);
+        return expectedWidth;
+    }
+
+    fputs("\033[6n", con_st.tty); // Query cursor position
+    int x1, x2, y1, y2;
+    int results = 0;
+    results = fscanf(con_st.tty, "\033[%d;%dR", &y1, &x1);
+
+    fwrite(utf8_codepoint, length, 1, con_st.tty);
+
+    fputs("\033[6n", con_st.tty); // Query cursor position
+    results += fscanf(con_st.tty, "\033[%d;%dR", &y2, &x2);
+
+    if (results != 4) {
+        return expectedWidth;
+    }
+
+    int width = x2 - x1;
+    if (width < 0) {
+        // Calculate the width considering text wrapping
+        struct winsize w;
+        ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
+        width += w.ws_col;
+    }
+    return width;
+#endif
+}
+
+void replace_last(console_state & con_st, char ch) {
+#if defined(_WIN32)
+    pop_cursor(con_st);
+    put_codepoint(con_st, &ch, 1, 1);
+#else
+    fprintf(con_st.out, "\b%c", ch);
+#endif
+}
+
+void append_utf8(char32_t ch, std::string & out) {
+    if (ch <= 0x7F) {
+        out.push_back(static_cast<unsigned char>(ch));
+    } else if (ch <= 0x7FF) {
+        out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
+        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0xFFFF) {
+        out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
+        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0x10FFFF) {
+        out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
+        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
+        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+    } else {
+        // Invalid Unicode code point
+    }
+}
+
+// Helper function to remove the last UTF-8 character from a string
+void pop_back_utf8_char(std::string & line) {
+    if (line.empty()) {
+        return;
+    }
+
+    size_t pos = line.length() - 1;
+
+    // Find the start of the last UTF-8 character (checking up to 4 bytes back)
+    for (size_t i = 0; i < 3 && pos > 0; ++i, --pos) {
+        if ((line[pos] & 0xC0) != 0x80) break; // Found the start of the character
+    }
+    line.erase(pos);
+}
+
+bool console_readline(console_state & con_st, std::string & line) {
+    console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+    if (con_st.out != stdout) {
+        fflush(stdout);
+    }
+
+    line.clear();
+    std::vector<int> widths;
+    bool is_special_char = false;
+    bool end_of_stream = false;
+
+    char32_t input_char;
+    while (true) {
+        fflush(con_st.out); // Ensure all output is displayed before waiting for input
+        input_char = getchar32();
+
+        if (input_char == '\r' || input_char == '\n') {
+            break;
+        }
+
+        if (input_char == WEOF || input_char == 0x04 /* Ctrl+D*/) {
+            end_of_stream = true;
+            break;
+        }
+
+        if (is_special_char) {
+            console_set_color(con_st, CONSOLE_COLOR_USER_INPUT);
+            replace_last(con_st, line.back());
+            is_special_char = false;
+        }
+
+        if (input_char == '\033') { // Escape sequence
+            char32_t code = getchar32();
+            if (code == '[' || code == 0x1B) {
+                // Discard the rest of the escape sequence
+                while ((code = getchar32()) != WEOF) {
+                    if ((code >= 'A' && code <= 'Z') || (code >= 'a' && code <= 'z') || code == '~') {
+                        break;
+                    }
+                }
+            }
+        } else if (input_char == 0x08 || input_char == 0x7F) { // Backspace
+            if (!widths.empty()) {
+                int count;
+                do {
+                    count = widths.back();
+                    widths.pop_back();
+                    // Move cursor back, print space, and move cursor back again
+                    for (int i = 0; i < count; i++) {
+                        replace_last(con_st, ' ');
+                        pop_cursor(con_st);
+                    }
+                    pop_back_utf8_char(line);
+                } while (count == 0 && !widths.empty());
+            }
+        } else {
+            int offset = line.length();
+            append_utf8(input_char, line);
+            int width = put_codepoint(con_st, line.c_str() + offset, line.length() - offset, estimateWidth(input_char));
+            if (width < 0) {
+                width = 0;
+            }
+            widths.push_back(width);
+        }
+
+        if (!line.empty() && (line.back() == '\\' || line.back() == '/')) {
+            console_set_color(con_st, CONSOLE_COLOR_PROMPT);
+            replace_last(con_st, line.back());
+            is_special_char = true;
+        }
+    }
+
+    bool has_more = con_st.multiline_input;
+    if (is_special_char) {
+        replace_last(con_st, ' ');
+        pop_cursor(con_st);
+
+        char last = line.back();
+        line.pop_back();
+        if (last == '\\') {
+            line += '\n';
+            fputc('\n', con_st.out);
+            has_more = !has_more;
+        } else {
+            // llama will just eat the single space, it won't act as a space
+            if (line.length() == 1 && line.back() == ' ') {
+                line.clear();
+                pop_cursor(con_st);
+            }
+            has_more = false;
+        }
+    } else {
+        if (end_of_stream) {
+            has_more = false;
+        } else {
+            line += '\n';
+            fputc('\n', con_st.out);
+        }
+    }
+
+    fflush(con_st.out);
+    return has_more;
+}
--- a/examples/common.h
+++ b/examples/common.h
@ -10,6 +10,11 @@
 #include <thread>
 #include <unordered_map>

+#if !defined (_WIN32)
+#include <stdio.h>
+#include <termios.h>
+#endif
+
 //
 // CLI argument parsing
 //
@ -43,6 +48,7 @@ struct gpt_params {
    std::string prompt = "";
    std::string path_session = "";       // path to file for saving/loading model eval state
    std::string input_prefix = "";       // string to prefix user inputs with
+    std::string input_suffix = "";       // string to suffix user inputs with
    std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted

    std::string lora_adapter = "";  // lora adapter path
@ -55,6 +61,7 @@ struct gpt_params {

    bool embedding         = false; // get only sentence embedding
    bool interactive_first = false; // wait for user input immediately
+    bool multiline_input   = false; // reverse the usage of `\`

    bool instruct          = false; // instruction mode (used for Alpaca models)
    bool penalize_nl       = true;  // consider newlines as a repeatable token
@ -103,13 +110,20 @@ enum console_color_t {
 };

 struct console_state {
+    bool multiline_input = false;
    bool use_color = false;
    console_color_t color = CONSOLE_COLOR_DEFAULT;
+
+    FILE* out = stdout;
+#if defined (_WIN32)
+    void* hConsole;
+#else
+    FILE* tty = nullptr;
+    termios prev_state;
+#endif
 };

-void set_console_color(console_state & con_st, console_color_t color);
-
-#if defined (_WIN32)
-void win32_console_init(bool enable_color);
-void win32_utf8_encode(const std::wstring & wstr, std::string & str);
-#endif
+void console_init(console_state & con_st);
+void console_cleanup(console_state & con_st);
+void console_set_color(console_state & con_st, console_color_t color);
+bool console_readline(console_state & con_st, std::string & line);
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -17,23 +17,50 @@ This example program allows you to use various LLaMA language models in an easy

 To get started right away, run the following command, making sure to use the correct path for the model you have:

+#### Unix-based systems (Linux, macOS, etc.):
+
 ```bash
 ./main -m models/7B/ggml-model.bin --prompt "Once upon a time"
 ```

-The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+#### Windows:

-```bash
-./main -m models/7B/ggml-model.bin --ignore-eos --n_predict -1 --keep -1 --prompt "Once upon a time"
+```powershell
+main.exe -m models\7B\ggml-model.bin --prompt "Once upon a time"
 ```

 For an interactive experience, try this command:

+#### Unix-based systems (Linux, macOS, etc.):
+
 ```bash
-./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " --prompt $'User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:'
+./main -m models/7B/ggml-model.bin -n -1 --color -r "User:" --in-prefix " " \
+'User: Hi
+AI: Hello. I am an AI chatbot. Would you like to talk?
+User: Sure!
+AI: What would you like to talk about?
+User:'
 ```

-Note that the newline characters in the prompt string above only work on Linux. On Windows, you will have to use the ``--file`` option (see below) to load a multi-line prompt from file instead.
+#### Windows:
+
+```powershell
+main.exe -m models\7B\ggml-model.bin -n -1 --color -r "User:" --in-prefix " " -e --prompt "User: Hi\nAI: Hello. I am an AI chatbot. Would you like to talk?\nUser: Sure!\nAI: What would you like to talk about?\nUser:"
+```
+
+The following command generates "infinite" text from a starting prompt (you can use `Ctrl-C` to stop it):
+
+#### Unix-based systems (Linux, macOS, etc.):
+
+```bash
+./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
+```
+
+#### Windows:
+
+```powershell
+main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
+```

 ## Common Options

@ -42,7 +69,6 @@ In this section, we cover the most commonly used options for running the `main`
 -   `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
 -   `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 -   `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
-   `-t N, --threads N`: Set the number of threads to use during computation. It is recommended to set this to the number of physical cores your CPU has.
 -   `-n N, --n_predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
 -   `-c N, --ctx_size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.

@ -86,13 +112,21 @@ The `--in-prefix` flag is used to add a prefix to your input, primarily, this is
 ./main -r "User:" --in-prefix " "
 ```

+### In-Suffix
+
+The `--in-suffix` flag is used to add a suffix after your input. This is useful for adding an "Assistant:" prompt after the user's input. It's added after the new-line character (`\n`) that's automatically added to the end of the user's input. Here's an example of how to use the `--in-suffix` flag in conjunction with the `--reverse-prompt` flag:
+
+```sh
+./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
+```
+
 ### Instruction Mode

 Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:

 -   `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.

-Technical detail: the user's input is internally prefixed with the reverse prompt (or ``### Instruction:`` as the default), and followed by ``### Response:`` (except if you just press Return without any input, to keep generating a longer response).
+Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response).

 By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.

@ -116,7 +150,7 @@ By utilizing context management options like `--ctx_size` and `--keep`, you can

 ## Generation Flags

-The following options are related to controlling the text generation process, influencing the diversity, creativity, and quality of the generated text. Understanding these options will help you fine-tune the output according to your needs:
+The following options allow you to control the text generation process and fine-tune the diversity, creativity, and quality of the generated text according to your needs. By adjusting these options and experimenting with different combinations of values, you can find the best settings for your specific use case.

 ### Number of Tokens to Predict

@ -124,13 +158,7 @@ The following options are related to controlling the text generation process, in

 The `--n_predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.

-It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the ``--ignore-eos`` parameter.
-
-### RNG Seed
-
-   `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1).
-
-The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.
+It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n_predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

 ### Temperature

@ -138,15 +166,21 @@ The RNG seed is used to initialize the random number generator that influences t

 Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.

-Example usage: `--temp 0.8`
+Example usage: `--temp 0.5`

 ### Repeat Penalty

 -   `--repeat_penalty N`: Control the repetition of token sequences in the generated text (default: 1.1).
+-   `--repeat_last_n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx_size).
+-   `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty.

-Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.
+The `repeat_penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1.1.

-Example usage: `--repeat_penalty 1.1`
+The `repeat_last_n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx_size`).
+
+Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases.
+
+Example usage: `--repeat_penalty 1.15 --repeat_last_n 128 --no-penalize-nl`

 ### Top-K Sampling

@ -154,7 +188,7 @@ Example usage: `--repeat_penalty 1.1`

 Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text. The default value is 40.

-Example usage: `--top_k 40`
+Example usage: `--top_k 30`

 ### Top-P Sampling

@ -162,23 +196,87 @@ Example usage: `--top_k 40`

 Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. The default value is 0.9.

-Example usage: `--top_p 0.9`
+Example usage: `--top_p 0.95`

-By adjusting these options, you can control the diversity, quality, and creativity of the generated text to better suit your needs. You can experiment with different combinations of values to find the best settings for your specific use case.
+### Tail Free Sampling (TFS)
+
+-   `--tfs N`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled).
+
+Tail free sampling (TFS) is a text generation technique that aims to reduce the impact of less likely tokens, which may be less relevant, less coherent, or nonsensical, on the output. The method adjusts the logits (token probabilities) by raising them to the power of the parameter z. A higher value of z (e.g., 2.0) will further suppress less likely tokens from the tail of the distribution, while a value of 1.0 disables the effect of TFS. By setting the parameter z, you can control how much the probabilities of less likely tokens are reduced.
+
+Example usage: `--tfs 2.0`
+
+### Locally Typical Sampling
+
+-   `--typical N`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled).
+
+Locally typical sampling promotes the generation of contextually coherent and diverse text by sampling tokens that are typical or expected based on the surrounding context. By setting the parameter p between 0 and 1, you can control the balance between producing text that is locally coherent and diverse. A value closer to 1 will promote more contextually coherent tokens, while a value closer to 0 will promote more diverse tokens. A value equal to 1 disables locally typical sampling.
+
+Example usage: `--typical 0.9`
+
+### Mirostat Sampling
+
+-   `--mirostat N`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0).
+-   `--mirostat_lr N`: Set the Mirostat learning rate, parameter eta (default: 0.1).
+-   `--mirostat_ent N`: Set the Mirostat target entropy, parameter tau (default: 5.0).
+
+Mirostat is an algorithm that actively maintains the quality of generated text within a desired range during text generation. It aims to strike a balance between coherence and diversity, avoiding low-quality output caused by excessive repetition (boredom traps) or incoherence (confusion traps).
+
+The `--mirostat_lr` option sets the Mirostat learning rate (eta). The learning rate influences how quickly the algorithm responds to feedback from the generated text. A lower learning rate will result in slower adjustments, while a higher learning rate will make the algorithm more responsive. The default value is `0.1`.
+
+The `--mirostat_ent` option sets the Mirostat target entropy (tau), which represents the desired perplexity value for the generated text. Adjusting the target entropy allows you to control the balance between coherence and diversity in the generated text. A lower value will result in more focused and coherent text, while a higher value will lead to more diverse and potentially less coherent text. The default value is `5.0`.
+
+Example usage: `--mirostat 2 --mirostat_lr 0.05 --mirostat_ent 3.0`
+
+### Logit Bias
+
+-   `-l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS`: Modify the likelihood of a token appearing in the generated text completion.
+
+The logit bias option allows you to manually adjust the likelihood of specific tokens appearing in the generated text. By providing a token ID and a positive or negative bias value, you can increase or decrease the probability of that token being generated.
+
+For example, use `--logit-bias 15043+1` to increase the likelihood of the token 'Hello', or `--logit-bias 15043-1` to decrease its likelihood. Using a value of negative infinity, `--logit-bias 15043-inf` ensures that the token `Hello` is never produced.
+
+A more practical use case might be to prevent the generation of `\code{begin}` and `\code{end}` by setting the `\` token (29905) to negative infinity with `-l 29905-inf`. (This is due to the prevalence of LaTeX codes that show up in LLaMA model inference.)
+
+Example usage: `--logit-bias 29905-inf`
+
+### RNG Seed
+
+-   `-s SEED, --seed SEED`: Set the random number generator (RNG) seed (default: -1, < 0 = random seed).
+
+The RNG seed is used to initialize the random number generator that influences the text generation process. By setting a specific seed value, you can obtain consistent and reproducible results across multiple runs with the same input and settings. This can be helpful for testing, debugging, or comparing the effects of different options on the generated text to see when they diverge. If the seed is set to a value less than 0, a random seed will be used, which will result in different outputs on each run.

 ## Performance Tuning and Memory Options

-These options help improve the performance and memory usage of the LLaMA models:
+These options help improve the performance and memory usage of the LLaMA models. By adjusting these settings, you can fine-tune the model's behavior to better suit your system's capabilities and achieve optimal performance for your specific use case.
+
+### Number of Threads
+
+-   `-t N, --threads N`: Set the number of threads to use during computation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has (as opposed to the logical number of cores). Using the correct number of threads can greatly improve performance.
+
+### Mlock
+
+-   `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. This can improve performance but trades away some of the advantages of memory-mapping by requiring more RAM to run and potentially slowing down load times as the model loads into RAM.
+
+### No Memory Mapping
+
+-   `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. However, if the model is larger than your total amount of RAM or if your system is low on available memory, using mmap might increase the risk of pageouts, negatively impacting performance. Disabling mmap results in slower load times but may reduce pageouts if you're not using `--mlock`. Note that if the model is larger than the total amount of RAM, turning off mmap would prevent the model from loading at all.
+
+### Memory Float 32
+
+-   `--memory_f32`: Use 32-bit floats instead of 16-bit floats for memory key+value, allowing higher quality inference at the cost of higher memory usage.
+
+### Batch Size

-   `-t N, --threads N`: Set the number of threads to use during computation. Using the correct number of threads can greatly improve performance. It is recommended to set this value to the number of CPU cores.
-   `--mlock`: Lock the model in memory, preventing it from being swapped out when mmaped. This can improve performance.
-   `--no-mmap`: Do not memory-map the model. This results in a slower load time but may reduce pageouts if you're not using `mlock`.
-   `--memory_f32`: Use 32 bit floats instead of 16 bit floats for memory key+value, allowing higher quality inference at the cost of memory.
 -   `-b N, --batch_size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations.

-For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).
+### Session Caching

-By understanding and using these performance tuning settings, you can optimize the LLaMA model's behavior to achieve the best performance for your specific needs.
+-   `--session FNAME`: Specify a file to load/save the session, which caches the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The session file is created during the first run and is reused in subsequent runs. If you change your prompt such that 75% or less of the session is reusable, the existing session file will be overwritten with a new, updated version to maintain optimal performance.
+
+### Quantization
+
+For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run).

 ## Additional Options

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -35,12 +35,12 @@ static bool is_interacting = false;

 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
 void sigint_handler(int signo) {
-    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-    printf("\n"); // this also force flush stdout.
    if (signo == SIGINT) {
        if (!is_interacting) {
            is_interacting=true;
        } else {
+            console_cleanup(con_st);
+            printf("\n");
            llama_print_timings(*g_ctx);
            _exit(130);
        }
@ -59,10 +59,9 @@ int main(int argc, char ** argv) {
    // save choice to use color for later
    // (note for later: this is a slightly awkward choice)
    con_st.use_color = params.use_color;
-
-#if defined (_WIN32)
-    win32_console_init(params.use_color);
-#endif
+    con_st.multiline_input = params.multiline_input;
+    console_init(con_st);
+    atexit([]() { console_cleanup(con_st); });

    if (params.perplexity) {
        printf("\n************\n");
@ -260,6 +259,10 @@ int main(int argc, char ** argv) {
        if (!params.input_prefix.empty()) {
            fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str());
        }
+
+        if (!params.input_suffix.empty()) {
+            fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
+        }
    }
    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
            params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
@ -271,12 +274,21 @@ int main(int argc, char ** argv) {
    std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);

    if (params.interactive) {
+        const char *control_message;
+        if (con_st.multiline_input) {
+            control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+                              " - To return control without starting a new line, end your input with '/'.\n";
+        } else {
+            control_message = " - Press Return to return control to LLaMa.\n"
+                              " - To return control without starting a new line, end your input with '/'.\n"
+                              " - If you want to submit another line, end your input with '\\'.\n";
+        }
        fprintf(stderr, "== Running in interactive mode. ==\n"
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
               " - Press Ctrl+C to interject at any time.\n"
 #endif
-               " - Press Return to return control to LLaMa.\n"
-               " - If you want to submit another line, end your input in '\\'.\n\n");
+               "%s\n", control_message);
+
        is_interacting = params.interactive_first;
    }

@ -295,7 +307,7 @@ int main(int argc, char ** argv) {
    int n_session_consumed = 0;

    // the first thing we will do is to output the prompt, so set color accordingly
-    set_console_color(con_st, CONSOLE_COLOR_PROMPT);
+    console_set_color(con_st, CONSOLE_COLOR_PROMPT);

    std::vector<llama_token> embd;

@ -309,7 +321,8 @@ int main(int argc, char ** argv) {
            if (n_past + (int) embd.size() > n_ctx) {
                const int n_left = n_past - params.n_keep;

-                n_past = params.n_keep;
+                // always keep the first token - BOS
+                n_past = std::max(1, params.n_keep);

                // insert n_left/2 tokens at the start of embd from last_n_tokens
                embd.insert(embd.begin(), last_n_tokens.begin() + n_ctx - n_left/2 - embd.size(), last_n_tokens.end() - embd.size());
@ -327,7 +340,6 @@ int main(int argc, char ** argv) {
            }

            // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
-            // REVIEW
            if (n_session_consumed < (int) session_tokens.size()) {
                size_t i = 0;
                for ( ; i < embd.size(); i++) {
@ -440,10 +452,10 @@ int main(int argc, char ** argv) {
                        id = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu);
                    } else {
                        // Temperature sampling
-                        llama_sample_top_k(ctx, &candidates_p, top_k);
-                        llama_sample_tail_free(ctx, &candidates_p, tfs_z);
-                        llama_sample_typical(ctx, &candidates_p, typical_p);
-                        llama_sample_top_p(ctx, &candidates_p, top_p);
+                        llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+                        llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
+                        llama_sample_typical(ctx, &candidates_p, typical_p, 1);
+                        llama_sample_top_p(ctx, &candidates_p, top_p, 1);
                        llama_sample_temperature(ctx, &candidates_p, temp);
                        id = llama_sample_token(ctx, &candidates_p);
                    }
@ -494,7 +506,7 @@ int main(int argc, char ** argv) {
        }
        // reset color to default if we there is no pending user input
        if (input_echo && (int)embd_inp.size() == n_consumed) {
-            set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
+            console_set_color(con_st, CONSOLE_COLOR_DEFAULT);
        }

        // in interactive mode, and not currently processing queued inputs;
@ -514,17 +526,12 @@ int main(int argc, char ** argv) {
                    if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
                        is_interacting = true;
                        is_antiprompt = true;
-                        set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
-                        fflush(stdout);
                        break;
                    }
                }
            }

            if (n_past > 0 && is_interacting) {
-                // potentially set color to indicate we are taking user input
-                set_console_color(con_st, CONSOLE_COLOR_USER_INPUT);
-
                if (params.instruct) {
                    printf("\n> ");
                }
@ -538,33 +545,21 @@ int main(int argc, char ** argv) {
                std::string line;
                bool another_line = true;
                do {
-#if defined(_WIN32)
-                    std::wstring wline;
-                    if (!std::getline(std::wcin, wline)) {
-                        // input stream is bad or EOF received
-                        return 0;
-                    }
-                    win32_utf8_encode(wline, line);
-#else
-                    if (!std::getline(std::cin, line)) {
-                        // input stream is bad or EOF received
-                        return 0;
-                    }
-#endif
-                    if (line.empty() || line.back() != '\\') {
-                        another_line = false;
-                    } else {
-                        line.pop_back(); // Remove the continue character
-                    }
-                    buffer += line + '\n'; // Append the line to the result
+                    another_line = console_readline(con_st, line);
+                    buffer += line;
                } while (another_line);

                // done taking input, reset color
-                set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
+                console_set_color(con_st, CONSOLE_COLOR_DEFAULT);

                // Add tokens to embd only if the input buffer is non-empty
                // Entering a empty line lets the user pass control back
                if (buffer.length() > 1) {
+                    // append input suffix if any
+                    if (!params.input_suffix.empty()) {
+                        buffer += params.input_suffix;
+                        printf("%s", params.input_suffix.c_str());
+                    }

                    // instruct mode: insert instruction prefix
                    if (params.instruct && !is_antiprompt) {
@ -611,7 +606,5 @@ int main(int argc, char ** argv) {
    llama_print_timings(ctx);
    llama_free(ctx);

-    set_console_color(con_st, CONSOLE_COLOR_DEFAULT);
-
    return 0;
 }
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -25,46 +25,68 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
    // Download: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
    // Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
    // Output: `perplexity: 13.5106 [114/114]`
+    // BOS tokens will be added for each chunk before eval
    auto tokens = ::llama_tokenize(ctx, params.prompt, true);

-    int count = 0;
-    int seq_count = tokens.size() / params.n_ctx;
-    int n_vocab = llama_n_vocab(ctx);
+    int count   = 0;
+
+    const int n_chunk = tokens.size() / params.n_ctx;
+    const int n_vocab = llama_n_vocab(ctx);
+    const int n_batch = params.n_batch;

    double nll = 0.0;
-    fprintf(stderr, "%s : calculating perplexity over %d chunks, batch_size=%d\n", __func__, seq_count, params.n_batch);
+    fprintf(stderr, "%s: calculating perplexity over %d chunks, batch_size=%d\n", __func__, n_chunk, n_batch);

-    for (int i = 0; i < seq_count; ++i) {
-        int start = i * params.n_ctx;
-        int end = start + params.n_ctx;
+    for (int i = 0; i < n_chunk; ++i) {
+        const int start =     i * params.n_ctx;
+        const int end   = start + params.n_ctx;
+
+        const int num_batches = (params.n_ctx + n_batch - 1) / n_batch;

        std::vector<float> logits;
-        int num_batches = (params.n_ctx + params.n_batch - 1) / params.n_batch;
-        auto start_t = std::chrono::high_resolution_clock::now();
+
+        const auto t_start = std::chrono::high_resolution_clock::now();
+
        for (int j = 0; j < num_batches; ++j) {
-            int batch_start = start + j * params.n_batch;
-            int batch_size = std::min(end - batch_start, params.n_batch);
-            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * params.n_batch, params.n_threads)) {
+            const int batch_start = start + j * n_batch;
+            const int batch_size  = std::min(end - batch_start, n_batch);
+
+            // save original token and restore it after eval
+            const auto token_org = tokens[batch_start];
+
+            // add BOS token for the first batch of each chunk
+            if (j == 0) {
+                tokens[batch_start] = llama_token_bos();
+            }
+
+            if (llama_eval(ctx, tokens.data() + batch_start, batch_size, j * n_batch, params.n_threads)) {
                fprintf(stderr, "%s : failed to eval\n", __func__);
                return;
            }
-            auto batch_logits = llama_get_logits(ctx);
+
+            // restore the original token in case it was set to BOS
+            tokens[batch_start] = token_org;
+
+            const auto batch_logits = llama_get_logits(ctx);
            logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
        }
-        auto end_t = std::chrono::high_resolution_clock::now();
+
+        const auto t_end = std::chrono::high_resolution_clock::now();
+
        if (i == 0) {
-            const float seconds = std::chrono::duration<float>(end_t - start_t).count();
-            printf("%.2f seconds per pass - ETA ", seconds);
-            int total_seconds = (int)(seconds * seq_count);
+            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
+            fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total);
+            int total_seconds = (int)(t_total * n_chunk);
            if (total_seconds >= 60*60) {
-                printf("%d hours ", total_seconds / (60*60));
+                fprintf(stderr, "%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
-            printf("%d minutes\n", total_seconds / 60);
+            fprintf(stderr, "%d minutes\n", total_seconds / 60);
        }
+
        // We get the logits for all the tokens in the context window (params.n_ctx)
        // from llama_eval above.  Now, based on https://huggingface.co/docs/transformers/perplexity,
-        // calculate the perplexity over the last half the window (so the model always has
+        // calculate the perplexity over the last half of the window (so the model always has
        // some context to predict the token).
        //
        // We rely on the fact that attention in the forward pass only looks at previous
@ -76,10 +98,12 @@ void perplexity(llama_context * ctx, const gpt_params & params) {
        // process the entire prompt.
        for (int j = std::min(512, params.n_ctx / 2); j < params.n_ctx - 1; ++j) {
            // Calculate probability of next token, given the previous ones.
-            std::vector<float> tok_logits(
-                logits.begin() + j * n_vocab,
+            const std::vector<float> tok_logits(
+                logits.begin() + (j + 0) * n_vocab,
                logits.begin() + (j + 1) * n_vocab);
-            float prob = softmax(tok_logits)[tokens[start + j + 1]];
+
+            const float prob = softmax(tok_logits)[tokens[start + j + 1]];
+
            nll += -std::log(prob);
            ++count;
        }
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -6,23 +6,47 @@
 #include <map>
 #include <string>

-static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
-  {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
-  {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
-  {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
-  {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
-  {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
-  {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
+static const std::map<std::string, llama_ftype> LLAMA_FTYPE_MAP = {
+    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
+    {"q5_0", LLAMA_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", LLAMA_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
 };

+bool try_parse_ftype(const std::string & ftype_str, llama_ftype & ftype, std::string & ftype_str_out) {
+    auto it = LLAMA_FTYPE_MAP.find(ftype_str);
+    if (it != LLAMA_FTYPE_MAP.end()) {
+        ftype = it->second;
+        ftype_str_out = it->first;
+        return true;
+    }
+    // try to parse as an integer
+    try {
+        int ftype_int = std::stoi(ftype_str);
+        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+            if (it->second == ftype_int) {
+                ftype = it->second;
+                ftype_str_out = it->first;
+                return true;
+            }
+        }
+    }
+    catch (...) {
+        // stoi failed
+    }
+    return false;
+}
+
 // usage:
-//  ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
+//  ./quantize models/llama/ggml-model.bin [models/llama/ggml-model-quant.bin] type [nthreads]
 //
 int main(int argc, char ** argv) {
    ggml_time_init();

-    if (argc < 4) {
-        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
+    if (argc < 3) {
+        fprintf(stderr, "usage: %s model-f32.bin [model-quant.bin] type [nthreads]\n", argv[0]);
        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
        }
@ -36,24 +60,62 @@ int main(int argc, char ** argv) {
        ggml_free(ctx);
    }

+    // parse command line arguments
    const std::string fname_inp = argv[1];
-    const std::string fname_out = argv[2];
+    std::string fname_out;
+    int nthread;
+    llama_ftype ftype;

-    enum llama_ftype ftype;
-    if (argv[3][0] == 'q') {
-        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
-        if (it == LLAMA_FTYPE_MAP.end()) {
-            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+    int arg_idx = 2;
+    std::string ftype_str;
+    if (try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
+        // argv[2] is the ftype
+        std::string fpath;
+        const size_t pos = fname_inp.find_last_of('/');
+        if (pos != std::string::npos) {
+            fpath = fname_inp.substr(0, pos + 1);
+        }
+        // export as [inp path]/ggml-model-[ftype].bin
+        fname_out = fpath + "ggml-model-" + ftype_str + ".bin";
+        arg_idx++;
+    }
+    else {
+        // argv[2] is the output path
+        fname_out = argv[arg_idx];
+        arg_idx++;
+
+        if (argc <= arg_idx) {
+            fprintf(stderr, "%s: missing ftype\n", __func__);
+            return 1;
+        }
+        // argv[3] is the ftype
+        if (!try_parse_ftype(argv[arg_idx], ftype, ftype_str)) {
+            fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        arg_idx++;
+    }
+
+    // parse nthreads
+    if (argc > arg_idx) {
+        try {
+            nthread = std::stoi(argv[arg_idx]);
+        }
+        catch (const std::exception & e) {
+            fprintf(stderr, "%s: invalid nthread '%s' (%s)\n", __func__, argv[arg_idx], e.what());
            return 1;
        }
-        ftype = it->second;
    } else {
-        ftype = (enum llama_ftype)atoi(argv[3]);
+        nthread = 0;
    }

    fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);

-    int nthread = argc > 4 ? atoi(argv[4]) : 0;
+    fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+    if (nthread > 0) {
+        fprintf(stderr, " using %d threads", nthread);
+    }
+    fprintf(stderr, "\n");

    const int64_t t_main_start_us = ggml_time_us();

--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -348,7 +348,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
    CUDA_CHECK(cudaFree(ptr));
 }

-#define GGML_CUDA_MAX_STREAMS 8
+#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
 #define GGML_CUDA_MAX_EVENTS 64
 static cublasHandle_t g_cublasH = nullptr;
 static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
--- a/ggml.c
+++ b/ggml.c
@ -137,6 +137,9 @@ inline static void* ggml_aligned_malloc(size_t size) {

 #if defined(GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
+#if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
+#include "ggml-opencl.h"
+#endif
 #elif defined(GGML_USE_OPENBLAS)
 #include <cblas.h>
 #elif defined(GGML_USE_CUBLAS)
@ -180,9 +183,13 @@ typedef double ggml_float;
 #undef bool
 #define bool _Bool
 #else
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <intrin.h>
+#else
 #include <immintrin.h>
 #endif
 #endif
+#endif

 #ifdef __F16C__

@ -1509,15 +1516,135 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
 }

 static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
+    assert(QK8_0 == 32);
    assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;

    block_q8_0 * restrict y = vy;

+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
+
+        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
+        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
+        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
+            const int32x4_t   vi = vcvtnq_s32_f32(v);
+
+            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 127.f;
+        y[i].d = d;
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
+                                            // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#else
+    // scalar
    quantize_row_q8_0_reference(x, y, k);
+#endif
 }

 // reference implementation for deterministic creation of model files
 static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
+    assert(QK8_1 == 32);
    assert(k % QK8_1 == 0);
    const int nb = k / QK8_1;

@ -11536,7 +11663,11 @@ typedef int ggml_lock_t;

 #define ggml_lock_init(x)    UNUSED(x)
 #define ggml_lock_destroy(x) UNUSED(x)
+#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))
+#define ggml_lock_lock(x)    _mm_pause()
+#else
 #define ggml_lock_lock(x)    UNUSED(x)
+#endif
 #define ggml_lock_unlock(x)  UNUSED(x)

 #define GGML_LOCK_INITIALIZER 0
--- a/llama-util.h
+++ b/llama-util.h
@ -14,6 +14,7 @@

 #include <string>
 #include <vector>
+#include <stdexcept>

 #ifdef __has_include
    #if __has_include(<unistd.h>)
@ -74,7 +75,7 @@ struct llama_file {
    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
        }
        seek(0, SEEK_END);
        size = tell();
@ -107,10 +108,10 @@ struct llama_file {
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
        }
        if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
        }
    }

@ -133,7 +134,7 @@ struct llama_file {
        errno = 0;
        size_t ret = std::fwrite(ptr, size, 1, fp);
        if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
        }
    }

@ -180,7 +181,7 @@ struct llama_mmap {
 #endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
        }

        if (prefetch) {
@ -207,7 +208,7 @@ struct llama_mmap {
        DWORD error = GetLastError();

        if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@ -215,7 +216,7 @@ struct llama_mmap {
        CloseHandle(hMapping);

        if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }

        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@ -245,7 +246,7 @@ struct llama_mmap {

    llama_mmap(struct llama_file *, bool prefetch = true) {
        (void)prefetch;
-        throw std::string("mmap not supported");
+        throw std::runtime_error(std::string("mmap not supported"));
    }
 #endif
 };
--- a/llama.cpp
+++ b/llama.cpp
@ -970,8 +970,6 @@ static void llama_model_load_internal(

    // prepare memory for the weights
    {
-        const auto & hparams = model.hparams;
-
        const uint32_t n_embd  = hparams.n_embd;
        const uint32_t n_layer = hparams.n_layer;
        const uint32_t n_vocab = hparams.n_vocab;
@ -1052,6 +1050,13 @@ static bool llama_eval_internal(
            const int   n_tokens,
            const int   n_past,
            const int   n_threads) {
+
+    // enforce that the first token is BOS
+    if (n_past == 0 && tokens[0] != llama_token_bos()) {
+        fprintf(stderr, "%s: first token must be BOS\n", __func__);
+        return false;
+    }
+
    const int64_t t_start_us = ggml_time_us();

    const int N = n_tokens;
@ -1482,7 +1487,7 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
    }

    if (bos) {
-        output.push_back(1);
+        output.push_back(llama_token_bos());
    }

    tokenizer.tokenize(text, output);
@ -1791,7 +1796,7 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
    float k = powf((epsilon_hat * powf(2, *mu)) / (1 - powf(N, -epsilon_hat)), 1 / s_hat);

    // Sample the next word X using top-k sampling
-    llama_sample_top_k(nullptr, candidates, int(k));
+    llama_sample_top_k(nullptr, candidates, int(k), 1);
    if (ctx) {
        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
    }
@ -2727,11 +2732,14 @@ int llama_eval(
        fprintf(stderr, "%s: failed to eval\n", __func__);
        return 1;
    }
+
    // get a more accurate load time, upon first eval
+    // TODO: fix this
    if (!ctx->has_evaluated_once) {
        ctx->t_load_us = ggml_time_us() - ctx->t_start_us;
        ctx->has_evaluated_once = true;
    }
+
    return 0;
 }

--- a/llama.h
+++ b/llama.h
@ -202,16 +202,16 @@ extern "C" {
    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);

    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);

    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);

    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);

    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
--- a/scripts/ppl-run-all.sh
+++ b/scripts/ppl-run-all.sh
@ -0,0 +1,43 @@
+#!/bin/bash
+
+#
+# quantize
+#
+
+# 7B
+time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-7b-q4_0.txt
+time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-7b-q4_1.txt
+time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-7b-q4_2.txt
+time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-7b-q5_0.txt
+time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-7b-q5_1.txt
+time ./bin/quantize ../models/7B/ggml-model-f16.bin ../models/7B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-7b-q8_0.txt
+
+# 13B
+time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_0.bin q4_0 2>&1 | tee ../qnt-13b-q4_0.txt
+time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_1.bin q4_1 2>&1 | tee ../qnt-13b-q4_1.txt
+time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q4_2.bin q4_2 2>&1 | tee ../qnt-13b-q4_2.txt
+time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_0.bin q5_0 2>&1 | tee ../qnt-13b-q5_0.txt
+time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q5_1.bin q5_1 2>&1 | tee ../qnt-13b-q5_1.txt
+time ./bin/quantize ../models/13B/ggml-model-f16.bin ../models/13B/ggml-model-q8_0.bin q8_0 2>&1 | tee ../qnt-13b-q8_0.txt
+
+#
+# perplexity
+#
+
+# 7B
+time ./bin/perplexity -m ../models/7B/ggml-model-f16.bin  -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-f16.txt
+time ./bin/perplexity -m ../models/7B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_0.txt
+time ./bin/perplexity -m ../models/7B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_1.txt
+time ./bin/perplexity -m ../models/7B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q4_2.txt
+time ./bin/perplexity -m ../models/7B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_0.txt
+time ./bin/perplexity -m ../models/7B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q5_1.txt
+time ./bin/perplexity -m ../models/7B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-7b-q8_0.txt
+
+# 13B
+time ./bin/perplexity -m ../models/13B/ggml-model-f16.bin  -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-f16.txt
+time ./bin/perplexity -m ../models/13B/ggml-model-q4_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_0.txt
+time ./bin/perplexity -m ../models/13B/ggml-model-q4_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_1.txt
+time ./bin/perplexity -m ../models/13B/ggml-model-q4_2.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q4_2.txt
+time ./bin/perplexity -m ../models/13B/ggml-model-q5_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_0.txt
+time ./bin/perplexity -m ../models/13B/ggml-model-q5_1.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q5_1.txt
+time ./bin/perplexity -m ../models/13B/ggml-model-q8_0.bin -f ./wiki.test.raw --no-mmap -t 12 2>&1 | tee ../ppl-13b-q8_0.txt
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@ -32,7 +32,7 @@ void test_top_k(const std::vector<float> & probs,
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    llama_sample_softmax(nullptr, &candidates_p);
    DUMP(&candidates_p);
-    llama_sample_top_k(nullptr, &candidates_p, k);
+    llama_sample_top_k(nullptr, &candidates_p, k, 1);
    DUMP(&candidates_p);

    assert(candidates_p.size == expected_probs.size());
@ -57,7 +57,7 @@ void test_top_p(const std::vector<float> & probs,
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    llama_sample_softmax(nullptr, &candidates_p);
    DUMP(&candidates_p);
-    llama_sample_top_p(nullptr, &candidates_p, p);
+    llama_sample_top_p(nullptr, &candidates_p, p, 1);
    DUMP(&candidates_p);

    assert(candidates_p.size == expected_probs.size());
@ -80,7 +80,7 @@ void test_tfs(const std::vector<float> & probs,

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    DUMP(&candidates_p);
-    llama_sample_tail_free(nullptr, &candidates_p, z);
+    llama_sample_tail_free(nullptr, &candidates_p, z, 1);
    DUMP(&candidates_p);

    assert(candidates_p.size == expected_probs.size());
@ -103,7 +103,7 @@ void test_typical(const std::vector<float> & probs,

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
    DUMP(&candidates_p);
-    llama_sample_typical(nullptr, &candidates_p, p);
+    llama_sample_typical(nullptr, &candidates_p, p, 1);
    DUMP(&candidates_p);

    assert(candidates_p.size == expected_probs.size());