diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
new file mode 100644
index 000000000..618cdddc4
--- /dev/null
+++ b/.devops/full.Dockerfile
@@ -0,0 +1,17 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential python3 python3-pip
+
+RUN pip install --upgrade pip setuptools wheel \
+    && pip install torch torchvision torchaudio sentencepiece numpy
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
\ No newline at end of file
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
new file mode 100644
index 000000000..cd575efa0
--- /dev/null
+++ b/.devops/main.Dockerfile
@@ -0,0 +1,18 @@
+ARG UBUNTU_VERSION=22.04
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+RUN apt-get update && \
+    apt-get install -y build-essential
+
+WORKDIR /app
+
+COPY . .
+
+RUN make
+
+FROM ubuntu:$UBUNTU_VERSION as runtime
+
+COPY --from=build /app/main /main
+
+ENTRYPOINT [ "/main" ]
\ No newline at end of file
diff --git a/.devops/tools.sh b/.devops/tools.sh
new file mode 100755
index 000000000..352e04942
--- /dev/null
+++ b/.devops/tools.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+set -e
+
+# Read the first argument into a variable
+arg1="$1"
+
+# Shift the arguments to remove the first one
+shift
+
+# Join the remaining arguments into a single string
+arg2="$@"
+
+if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
+    python3 ./convert-pth-to-ggml.py $arg2
+elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
+    ./quantize $arg2
+elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
+    ./main $arg2
+elif [[ $arg1 == '--download' || $arg1 == '-d' ]]; then
+    python3 ./download-pth.py $arg2
+elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
+    echo "Downloading model..."
+    python3 ./download-pth.py "$1" "$2"
+    echo "Converting PTH to GGML..."
+    for i in `ls $1/$2/ggml-model-f16.bin*`; do
+        if [ -f "${i/f16/q4_0}" ]; then
+            echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
+        else
+            echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
+            ./quantize "$i" "${i/f16/q4_0}" 2
+        fi
+    done
+else
+    echo "Unknown command: $arg1"
+    echo "Available commands: "
+    echo "  --run (-r): Run a model previously converted into ggml"
+    echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --convert (-c): Convert a llama model into ggml"
+    echo "                  ex: \"/models/7B/\" 1"
+    echo "  --quantize (-q): Optimize with quantization process ggml"
+    echo "                   ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
+    echo "  --download (-d): Download original llama model from CDN: https://agi.gpt4.org/llama/"
+    echo "                   ex: \"/models/\" 7B"
+    echo "  --all-in-one (-a): Execute --download, --convert & --quantize"
+    echo "                     ex: \"/models/\" 7B"
+fi
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 000000000..952990f26
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,24 @@
+*.o
+*.a
+.cache/
+.vs/
+.vscode/
+.DS_Store
+
+build/
+build-em/
+build-debug/
+build-release/
+build-static/
+build-no-accel/
+build-sanitize-addr/
+build-sanitize-thread/
+
+models/*
+
+/main
+/quantize
+
+arm_neon.h
+compile_commands.json
+Dockerfile
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 1a068ae75..9c1de5823 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -1,8 +1,42 @@
 name: CI
-on: [push, pull_request]
+
+on:
+  workflow_dispatch: # allows manual triggering
+    inputs:
+      create_release:
+        description: 'Create new release'
+        required: true
+        type: boolean
+  push:
+    paths: ['.github/workflows/**', 'CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+  pull_request:
+    types: [opened, synchronize, edited, reopened, review_requested, ready_for_review]
+    paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
+
+env:
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 
 jobs:
-  ubuntu-latest:
+  ubuntu-latest-make:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: make_build
+        run: |
+          make
+
+  ubuntu-latest-cmake:
     runs-on: ubuntu-latest
 
     steps:
@@ -15,10 +49,31 @@ jobs:
         sudo apt-get install build-essential
 
       - name: Build
+        run: |
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
+
+  macOS-latest-make:
+    runs-on: macos-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          brew update
+
+      - name: Build
+        id: make_build
         run: |
           make
 
-  macOS-latest:
+  macOS-latest-cmake:
     runs-on: macOS-latest
 
     steps:
@@ -31,22 +86,59 @@
 
       - name: Build
         run: |
-          make
+          mkdir build
+          cd build
+          cmake ..
+          cmake --build . --config Release
 
-  windows-latest:
+  windows-latest-cmake:
    runs-on: windows-latest
 
    steps:
      - name: Clone
+        id: checkout
        uses: actions/checkout@v1
 
      - name: Build
+        id: cmake_build
        run: |
          mkdir build
          cd build
          cmake ..
          cmake --build . --config Release
+
+      - name: Get commit hash
+        id: commit
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: pr-mpt/actions-commit-hash@v2
+
+      - name: Pack artifacts
+        id: pack_artifacts
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        run: |
+          7z a llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip .\build\Release\*
+
+      - name: Create release
+        id: create_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: zendesk/action-create-release@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          tag_name: ${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}
+
+      - name: Upload release
+        id: upload_release
+        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ steps.create_release.outputs.upload_url }}
+          asset_path: .\llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_name: llama-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-x64.zip
+          asset_content_type: application/octet-stream
+
 # ubuntu-latest-gcc:
 #   runs-on: ubuntu-latest
 #
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
new file mode 100644
index 000000000..bc9aff7b7
--- /dev/null
+++ b/.github/workflows/docker.yml
@@ -0,0 +1,61 @@
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+# GitHub recommends pinning actions to a commit SHA.
+# To get a newer version, you will need to update the SHA.
+# You can also reference a tag or branch, but the action may change without warning.
+
+name: Publish Docker image
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    runs-on: ubuntu-latest
+    env:
+      COMMIT_SHA: ${{ github.sha }}
+    strategy:
+      matrix:
+        config:
+          - { tag: "light", dockerfile: ".devops/main.Dockerfile" }
+          - { tag: "full", dockerfile: ".devops/full.Dockerfile" }
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v2
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v2
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push Docker image (versioned)
+        if: github.event_name == 'push'
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: true
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          file: ${{ matrix.config.dockerfile }}
+
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: ${{ github.event_name == 'push' }}
+          tags: "ghcr.io/ggerganov/llama.cpp:${{ matrix.config.tag }}"
+          file: ${{ matrix.config.dockerfile }}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 5eb1ff1b8..3087b0ea5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,6 +18,10 @@ models/*
 
 /main
 /quantize
+/result
 
 arm_neon.h
 compile_commands.json
+
+.envrc
+.direnv/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca3be38a5..38e7266dc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,8 @@ project("llama.cpp")
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
+set(THREADS_PREFER_PTHREAD_FLAG ON)
+find_package(Threads REQUIRED)
 
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
@@ -126,3 +128,4 @@ target_link_libraries(ggml PRIVATE ${LLAMA_EXTRA_LIBS})
 target_include_directories(ggml PUBLIC .)
 target_link_libraries(quantize PRIVATE ggml)
 target_link_libraries(llama PRIVATE ggml)
+target_link_libraries(ggml PRIVATE Threads::Threads)
diff --git a/README.md b/README.md
index 15e1b9a2d..1fe5b5426 100644
--- a/README.md
+++ b/README.md
@@ -32,13 +32,14 @@ Supported platforms:
 - [X] Mac OS
 - [X] Linux
 - [X] Windows (via CMake)
+- [X] Docker
 
 ---
 
 Here is a typical run using LLaMA-7B:
 
 ```java
-make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -t 8 -n 512
+make -j && ./main -m ./models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
 I llama.cpp build info:
 I UNAME_S:  Darwin
 I UNAME_P:  arm
@@ -149,12 +150,24 @@ python3 convert-pth-to-ggml.py models/7B/ 1
 ./quantize.sh 7B
 
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -t 8 -n 128
+./main -m ./models/7B/ggml-model-q4_0.bin -n 128
 ```
 
+Currently, it's best to use Python 3.9 or Python 3.10, as `sentencepiece` has not yet published a wheel for Python 3.11.
+
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
 
-TODO: add model disk/mem requirements
+### Memory/Disk Requirements
+
+As the models are currently fully loaded into memory, you will need adequate disk space to save them
+and sufficient RAM to load them. At the moment, memory and disk requirements are the same.
+
+| model | original size | quantized size (4-bit) |
+|-------|---------------|------------------------|
+| 7B    | 13 GB         | 3.9 GB                 |
+| 13B   | 24 GB         | 7.8 GB                 |
+| 30B   | 60 GB         | 19.5 GB                |
+| 65B   | 120 GB        | 38.5 GB                |
 
 ### Interactive mode
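Note: the 4-bit column in the new table is a plausible consequence of the block format `./quantize` writes. Assuming the q4_0 layout (one 32-bit float scale plus QK/2 = 16 packed bytes per block of QK = 32 weights, i.e. 20 bytes per 32 weights; that layout itself is not shown in this patch), the effective storage cost is

$$ \frac{20 \times 8}{32} = 5\ \text{bits/weight} \qquad\Rightarrow\qquad 13\ \text{GB} \times \frac{5}{16} \approx 4.1\ \text{GB}, $$

which lands close to the 3.9 GB listed for 7B; the 13B, 30B, and 65B rows scale the same way from their 16-bit originals.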
@@ -163,7 +176,7 @@ In this mode, you can always interrupt generation by pressing Ctrl+C and enter o
 Here is an example few-shot interaction, invoked with the command
 
 ```
-./main -m ./models/13B/ggml-model-q4_0.bin -t 8 -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
+./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" \
 -p \
 "Transcript of a dialog, where the User interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.
@@ -194,6 +207,37 @@ Finally, copy the `llama` binary and the model files to your device storage. Her
 
 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
 
+### Docker
+
+#### Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store big models & intermediate files (e.g. `/llama/models`, used in the examples below)
+
+#### Images
+We have two Docker images available for this project:
+
+1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4 bits.
+2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file.
+
+#### Usage
+
+The easiest way to download the models, convert them to ggml and optimize them is with the `--all-in-one` command, which is included in the full Docker image.
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+```
+
+On completion, you are ready to play!
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+```
+
+or with the light image:
+
+```bash
+docker run -v /llama/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+```
 
 ## Limitations
@@ -210,6 +254,7 @@ https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b0
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues and PRs is very appreciated!
+- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
 
 ### Coding guidelines
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index 5c36e9c09..d0eb213c8 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -16,7 +16,7 @@
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
-
+import os
 import sys
 import json
 import struct
@@ -64,6 +64,10 @@ if len(sys.argv) > 2:
     sys.exit(1)
 
 fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
+
+if os.path.exists(fname_out):
+    print(f"Skip conversion, it already exists: {fname_out}")
+    sys.exit(0)
 
 with open(fname_hparams, "r") as f:
     hparams = json.load(f)
diff --git a/download-pth.py b/download-pth.py
new file mode 100644
index 000000000..129532c0c
--- /dev/null
+++ b/download-pth.py
@@ -0,0 +1,66 @@
+import os
+import sys
+from tqdm import tqdm
+import requests
+
+if len(sys.argv) < 3:
+    print("Usage: download-pth.py dir-model model-type\n")
+    print("  model-type: Available models 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+modelsDir = sys.argv[1]
+model = sys.argv[2]
+
+num = {
+    "7B": 1,
+    "13B": 2,
+    "30B": 4,
+    "65B": 8,
+}
+
+if model not in num:
+    print(f"Error: model {model} is not valid, provide 7B, 13B, 30B or 65B")
+    sys.exit(1)
+
+print(f"Downloading model {model}")
+
+files = ["checklist.chk", "params.json"]
+
+for i in range(num[model]):
+    files.append(f"consolidated.0{i}.pth")
+
+resolved_path = os.path.abspath(os.path.join(modelsDir, model))
+os.makedirs(resolved_path, exist_ok=True)
+
+for file in files:
+    dest_path = os.path.join(resolved_path, file)
+
+    if os.path.exists(dest_path):
+        print(f"Skip file download, it already exists: {file}")
+        continue
+
+    url = f"https://agi.gpt4.org/llama/LLaMA/{model}/{file}"
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+                    t.update(len(chunk))
+
+files2 = ["tokenizer_checklist.chk", "tokenizer.model"]
+for file in files2:
+    dest_path = os.path.join(modelsDir, file)
+
+    if os.path.exists(dest_path):
+        print(f"Skip file download, it already exists: {file}")
+        continue
+
+    url = f"https://agi.gpt4.org/llama/LLaMA/{file}"
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        with tqdm(unit='B', unit_scale=True, miniters=1, desc=file) as t:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    f.write(chunk)
+                    t.update(len(chunk))
\ No newline at end of file
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 000000000..343996da1
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,43 @@
+{
+  "nodes": {
+    "flake-utils": {
+      "locked": {
+        "lastModified": 1676283394,
+        "narHash": "sha256-XX2f9c3iySLCw54rJ/CZs+ZK6IQy7GXNY4nSOyu2QG4=",
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "rev": "3db36a8b464d0c4532ba1c7dda728f4576d6d073",
+        "type": "github"
+      },
+      "original": {
+        "owner": "numtide",
+        "repo": "flake-utils",
+        "type": "github"
+      }
+    },
+    "nixpkgs": {
+      "locked": {
+        "lastModified": 1678470307,
+        "narHash": "sha256-OEeMUr3ueLIXyW/OaFUX5jUdimyQwMg/7e+/Q0gC/QE=",
+        "owner": "NixOS",
+        "repo": "nixpkgs",
+        "rev": "0c4800d579af4ed98ecc47d464a5e7b0870c4b1f",
+        "type": "github"
+      },
+      "original": {
+        "owner": "NixOS",
+        "ref": "nixos-unstable",
+        "repo": "nixpkgs",
+        "type": "github"
+      }
+    },
+    "root": {
+      "inputs": {
+        "flake-utils": "flake-utils",
+        "nixpkgs": "nixpkgs"
+      }
+    }
+  },
+  "root": "root",
+  "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 000000000..dae4ff60f
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,48 @@
+{
+  inputs = {
+    nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
+    flake-utils.url = "github:numtide/flake-utils";
+  };
+  outputs = { self, nixpkgs, flake-utils }:
+    flake-utils.lib.eachDefaultSystem (system:
+      let
+        pkgs = import nixpkgs {
+          inherit system;
+        };
+        llama-python = pkgs.python310.withPackages (ps: with ps; [
+          torch
+          numpy
+          sentencepiece
+        ]);
+      in
+      {
+        packages.default = pkgs.stdenv.mkDerivation {
+          name = "llama.cpp";
+          src = ./.;
+          nativeBuildInputs = with pkgs; [ cmake ];
+          buildInputs = with pkgs; lib.optionals stdenv.isDarwin [
+            darwin.apple_sdk.frameworks.Accelerate
+          ];
+          cmakeFlags = with pkgs; lib.optionals (system == "aarch64-darwin") [
+            "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1"
+          ];
+          installPhase = ''
+            mkdir -p $out/bin
+            mv llama $out/bin/llama
+            mv quantize $out/bin/quantize
+            echo "#!${llama-python}/bin/python" > $out/bin/convert-pth-to-ggml
+            cat ${./convert-pth-to-ggml.py} >> $out/bin/convert-pth-to-ggml
+            chmod +x $out/bin/convert-pth-to-ggml
+          '';
+        };
+        devShells.default = pkgs.mkShell {
+          packages = with pkgs; [
+            cmake
+            llama-python
+          ] ++ lib.optionals stdenv.isDarwin [
+            darwin.apple_sdk.frameworks.Accelerate
+          ];
+        };
+      }
+    );
+}
diff --git a/ggml.c b/ggml.c
index 535c7b7d2..4fb83adbd 100644
--- a/ggml.c
+++ b/ggml.c
@@ -607,10 +607,11 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
     assert(k % QK == 0);
 
     const int nb = k / QK;
+    const size_t bs = 2*sizeof(float) + QK/2;
 
-    float   * restrict pm = (float *) (y);
-    float   * restrict pd = (float *) (pm + nb);
-    uint8_t * restrict pb = (uint8_t *) (pd + nb);
+    uint8_t * restrict pd = ((uint8_t *)y + 0*bs);
+    uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float));
+    uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float));
 
     uint8_t pp[QK/2];
 
@@ -627,8 +628,10 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
         const float d = (max - min) / ((1 << 4) - 1);
         const float id = d ? 1.0f/d : 0.0f;
 
-        pm[i] = min;
-        pd[i] = d;
+        *(float *)pm = min;
+        *(float *)pd = d;
+        pm += bs;
+        pd += bs;
 
         for (int l = 0; l < QK; l += 2) {
             const float v0 = (x[i*QK + l + 0] - min)*id;
@@ -643,7 +646,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {
             pp[l/2] = vi0 | (vi1 << 4);
         }
 
-        memcpy(pb + i*QK/2, pp, sizeof(pp));
+        memcpy(pb, pp, sizeof(pp));
+        pb += bs;
     }
 }
 
@@ -687,16 +691,17 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {
     assert(k % QK == 0);
 
     const int nb = k / QK;
+    const size_t bs = 2*sizeof(float) + QK/2;
 
-    const float   * restrict pm = (const float *) (x);
-    const float   * restrict pd = (const float *) (pm + nb);
-    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
+    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
 
     for (int i = 0; i < nb; i++) {
-        const float m = pm[i];
-        const float d = pd[i];
+        const float d = *(const float *) (pd + i*bs);
+        const float m = *(const float *) (pm + i*bs);
 
-        const uint8_t * restrict pp = pb + i*QK/2;
+        const uint8_t * restrict pp = pb + i*bs;
 
         for (int l = 0; l < QK; l += 2) {
             const uint8_t vi = pp[l/2];
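Note: the hunks above (and the matching ones further down) switch q4_1 rows from a planar layout, where all mins, all deltas, and all packed nibbles were stored as three separate arrays, to an interleaved one in which each block carries its own two floats immediately followed by its QK/2 quantized bytes; that is what the new stride `bs = 2*sizeof(float) + QK/2` walks over. A minimal sketch of the resulting layout, assuming QK == 32 (`block_q4_1` and `dequant_one` are illustrative names, not part of this patch):

```cpp
#include <cstdint>

constexpr int QK = 32;

// One q4_1 block: the delta and min travel with their own quants,
// so a row of k weights is just (k/QK) consecutive 24-byte blocks.
struct block_q4_1 {
    float   d;          // delta (scale)
    float   m;          // min
    uint8_t qs[QK / 2]; // two 4-bit quants per byte
};

static_assert(sizeof(block_q4_1) == 2*sizeof(float) + QK/2, "must match bs");

// Recover weight l (0 <= l < QK) of block i, mirroring dequantize_row_q4_1:
inline float dequant_one(const block_q4_1 * row, int i, int l) {
    const uint8_t byte = row[i].qs[l / 2];
    const uint8_t vi   = (l % 2 == 0) ? (byte & 0x0F) : (byte >> 4);
    return row[i].d * vi + row[i].m;
}
```

Keeping each block self-contained turns dequantization into one contiguous read instead of three scattered ones, which is also what the AVX2 kernel in the next hunk exploits.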
@@ -1584,28 +1589,109 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void
 inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {
     const int nb = n / QK;
 
-    const float * restrict pm0 = (const float *) x;
-    const float * restrict pm1 = (const float *) y;
+    const size_t bs = 2*sizeof(float) + QK/2;
 
-    const float * restrict pd0 = (const float *) (pm0 + nb);
-    const float * restrict pd1 = (const float *) (pm1 + nb);
+    const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);
 
-    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);
-    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);
+    const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float));
+
+    const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
+    const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float));
 
     float sumf = 0.0;
 
-#if 1
+#if defined(__AVX2__)
+#if QK == 32
+    // Initialize accumulator with zeros
+    __m256 acc = _mm256_setzero_ps();
+    // Accumulator for constant offsets
+    float acc_offset = 0.0f;
+
+    // Main loop
+    for (int i = 0; i < nb; ++i) {
+        const float * m0 = (const float *) (pm0 + i*bs);
+        const float * m1 = (const float *) (pm1 + i*bs);
+
+        const float * d0 = (const float *) (pd0 + i*bs);
+        const float * d1 = (const float *) (pd1 + i*bs);
+
+        const uint8_t * restrict p0 = pb0 + i*bs;
+        const uint8_t * restrict p1 = pb1 + i*bs;
+
+        const __m256 d0v = _mm256_broadcast_ss( d0 );
+        const __m256 d1v = _mm256_broadcast_ss( d1 );
+        const __m256 m0v = _mm256_broadcast_ss( m0 );
+        const __m256 m1v = _mm256_broadcast_ss( m1 );
+
+
+        // Compute combined scale for the block
+        const __m256 scale_01 = _mm256_mul_ps( d0v, d1v );
+
+        // Compute cross scales for the block
+        const __m256 scale_0 = _mm256_mul_ps( d0v, m1v );
+        const __m256 scale_1 = _mm256_mul_ps( m0v, d1v );
+        const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 );
+
+        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
+        __m256i bx = bytesFromNibbles( p0 );
+        __m256i by = bytesFromNibbles( p1 );
+
+        // Now we have a vector with bytes in [ 0 .. 15 ] interval.
+
+        // Sign-extend first 16 signed bytes into int16_t
+        __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );
+        __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );
+        // Compute products of int16_t integers, add pairwise
+        __m256i i32 = _mm256_madd_epi16( x16, y16 );
+
+        // Sign-extend last 16 signed bytes into int16_t vectors
+        __m256i x16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );
+        __m256i y16_h = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );
+        // Accumulate products of int16_t integers
+        i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16_h, y16_h ) );
+
+        // compute sums of unsigned bytes in bx, by in blocks of 8.
+        // This results in a layout like X100 0000 X200 0000 X300 0000 X400 0000,
+        // which we then interleave as X100 Y100 X200 Y200 X300 Y300 X400 Y400.
+        // so if we then cast to 8 singles, we get 8 floats like [ x0_7, y0_7, x8_15, y8_15, x16_23, y16_23, x24_31, y24_31 ]
+        __m256i xsumi = _mm256_sad_epu8( bx, _mm256_setzero_si256() );
+        __m256i ysumi = _mm256_sad_epu8( by, _mm256_setzero_si256() );
+        __m256i sumsi = _mm256_or_si256( xsumi, _mm256_slli_si256( ysumi, 4 ) );
+        __m256  sums  = _mm256_cvtepi32_ps( sumsi );
+
+        // Convert int32_t to float
+        __m256 p = _mm256_cvtepi32_ps( i32 );
+        // Apply the scale, and accumulate
+        // acc += d0*d1*x*y + d0*m1*x + d1*m0*y
+        acc = _mm256_fmadd_ps( scale_01, p, acc );
+        acc = _mm256_fmadd_ps( cross_scales, sums, acc );
+        // acc_offset += m0*m1 (for each entry in the block)
+        acc_offset += (*m0)*(*m1);
+    }
+
+    // Return horizontal sum of the acc vector
+    __m128 res = _mm256_extractf128_ps( acc, 1 );
+    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );
+    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );
+    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );
+
+    sumf = _mm_cvtss_f32( res ) + acc_offset * QK;
+#else
+#error "not implemented for QK"
+#endif
+#else
     // scalar
     for (int i = 0; i < nb; i++) {
-        const float m0 = pm0[i];
-        const float m1 = pm1[i];
+        const float m0 = *(const float *) (pm0 + i*bs);
+        const float m1 = *(const float *) (pm1 + i*bs);
 
-        const float d0 = pd0[i];
-        const float d1 = pd1[i];
+        const float d0 = *(const float *) (pd0 + i*bs);
+        const float d1 = *(const float *) (pd1 + i*bs);
 
-        const uint8_t * restrict p0 = pb0 + i*QK/2;
-        const uint8_t * restrict p1 = pb1 + i*QK/2;
+        const uint8_t * restrict p0 = pb0 + i*bs;
+        const uint8_t * restrict p1 = pb1 + i*bs;
 
         for (int j = 0; j < QK/2; j++) {
             const uint8_t v0 = p0[j];
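Note: the AVX2 path above is easiest to read against the algebra it implements. With x[l] = d0*q0[l] + m0 and y[l] = d1*q1[l] + m1, one block's dot product expands into four terms, which is why the kernel keeps a combined scale (`scale_01`), two cross scales blended into `cross_scales`, the `_mm256_sad_epu8` byte sums, and a separate `acc_offset` multiplied by QK at the end. A scalar sketch of that expansion (a hypothetical reference helper operating on already-unpacked nibbles, not part of this patch):

```cpp
#include <cstdint>

// sum_l (d0*q0[l] + m0) * (d1*q1[l] + m1)
//   = d0*d1 * sum(q0*q1) + d0*m1 * sum(q0) + m0*d1 * sum(q1) + qk * m0*m1
float block_dot_q4_1_ref(float d0, float m0, const uint8_t * q0,  // q0[l] in [0, 15]
                         float d1, float m1, const uint8_t * q1,  // q1[l] in [0, 15]
                         int qk) {
    int sum01 = 0, sum0 = 0, sum1 = 0;
    for (int l = 0; l < qk; ++l) {
        sum01 += q0[l] * q1[l];
        sum0  += q0[l];
        sum1  += q1[l];
    }
    return d0*d1*sum01 + d0*m1*sum0 + m0*d1*sum1 + qk*m0*m1;
}
```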
@@ -1839,16 +1925,17 @@ inline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * res
     assert(n % QK == 0);
 
     const int nb = n / QK;
+    const size_t bs = 2*sizeof(float) + QK/2;
 
-    const float   * restrict pm = (const float *) (x);
-    const float   * restrict pd = (const float *) (pm + nb);
-    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);
+    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);
+    const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float));
+    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float));
 
     for (int i = 0; i < nb; i++) {
-        const float m = pm[i];
-        const float d = pd[i];
+        const float d = *(const float *) (pd + i*bs);
+        const float m = *(const float *) (pm + i*bs);
 
-        const uint8_t * restrict pp = pb + i*QK/2;
+        const uint8_t * restrict pp = pb + i*bs;
 
         for (int l = 0; l < QK; l += 2) {
             const uint8_t vi = pp[l/2];
@@ -9231,10 +9318,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
 }
 
 void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {
-    if (cgraph->n_threads <= 0) {
-        cgraph->n_threads = 8;
-    }
-
     const int n_threads = cgraph->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
diff --git a/main.cpp b/main.cpp
index 4c2f85e23..a824d46c6 100644
--- a/main.cpp
+++ b/main.cpp
@@ -143,16 +143,8 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
 
     // load vocab
     {
-        const int32_t n_vocab = model.hparams.n_vocab;
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
         std::string word;
-        for (int i = 0; i < n_vocab; i++) {
+        for (int i = 0; i < model.hparams.n_vocab; i++) {
             uint32_t len;
             fin.read((char *) &len, sizeof(len));
 
@@ -825,6 +817,11 @@ int main(int argc, char ** argv) {
     if (gpt_params_parse(argc, argv, params) == false) {
         return 1;
     }
+
+    if (params.n_ctx > 2048) {
+        fprintf(stderr, "%s: warning: model does not support context sizes greater than 2048 tokens (%d specified); "
+                "expect poor results\n", __func__, params.n_ctx);
+    }
 
     if (params.seed < 0) {
         params.seed = time(NULL);
@@ -870,6 +867,8 @@ int main(int argc, char ** argv) {
 
     std::vector<float> logits;
 
+    // Add a space in front of the first character to match OG llama tokenizer behavior
+    params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
     std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
 
@@ -1048,7 +1047,7 @@ int main(int argc, char ** argv) {
             if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
             if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
                 // presumably empty line, consume the newline
-                scanf("%*c");
+                std::ignore = scanf("%*c");
                 n_read=0;
             }
             if(params.use_color) printf(ANSI_COLOR_RESET);
diff --git a/utils.cpp b/utils.cpp
index 20b6a86ce..966e9c91f 100644
--- a/utils.cpp
+++ b/utils.cpp
@@ -16,6 +16,18 @@
 #endif
 
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    // determine sensible default number of threads.
+    // std::thread::hardware_concurrency may not be equal to the number of cores, or may return 0.
+#ifdef __linux__
+    std::ifstream cpuinfo("/proc/cpuinfo");
+    params.n_threads = std::count(std::istream_iterator<std::string>(cpuinfo),
+                                  std::istream_iterator<std::string>(),
+                                  std::string("processor"));
+#endif
+    if (params.n_threads == 0) {
+        params.n_threads = std::max(1, (int32_t) std::thread::hardware_concurrency());
+    }
+
     for (int i = 1; i < argc; i++) {
         std::string arg = argv[i];
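Note on the `/proc/cpuinfo` default above: `std::istream_iterator<std::string>` splits the stream on whitespace, so each logical CPU's `processor : N` stanza contributes exactly one token equal to `processor` to the count. A standalone sketch of the same idea (Linux-only, hypothetical demo program, not part of this patch):

```cpp
#include <algorithm>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>

int main() {
    std::ifstream cpuinfo("/proc/cpuinfo");
    // Each logical CPU has a "processor : N" line; counting the
    // whitespace-delimited "processor" tokens counts those lines.
    const auto n = std::count(std::istream_iterator<std::string>(cpuinfo),
                              std::istream_iterator<std::string>(),
                              std::string("processor"));
    std::cout << "detected " << n << " logical processors\n";
}
```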
@@ -277,40 +289,56 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
 std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    //auto res = gpt_tokenize(vocab, text);
-
-    //if (bos) {
-    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos
-    //}
-
     std::vector<gpt_vocab::id> res;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
+            }
+        }
+    }
+
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+            // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+            break;
+        }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }
 
     if (bos) {
         res.push_back(1); // TODO: replace with vocab.bos
     }
 
-    //find the longest token that matches the text
-    int pos = 0;
-    while (true) {
-        int l = 0;
-        int t = 0;
-        for (const auto & kv : vocab.id_to_token) {
-            if (kv.second.size() < l) continue;
-            if (kv.second.size() > text.size() - pos) continue;
-            if (text.substr(pos, kv.second.size()) == kv.second) {
-                l = kv.second.size();
-                t = kv.first;
-            }
-        }
-
-        if (l == 0) {
-            break;
-        }
-
-        res.push_back(t);
-        pos += l;
-    }
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());
 
     return res;
 }
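Note: the forward pass above prefers longer pieces because a token of length L scores L*L, and L^2 > a^2 + b^2 whenever a + b = L with a, b > 0, so a whole token always outscores any split of itself. A toy standalone version of the same dynamic programming over a hypothetical four-token vocabulary (not the real LLaMA vocab):

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // Hypothetical vocabulary; the ids are arbitrary illustration values.
    const std::map<std::string, int> vocab = {
        {"h", 10}, {"he", 11}, {"llo", 12}, {"hello", 13},
    };
    const std::string text = "hello";
    const int len = (int) text.size();

    std::vector<int> score(len + 1, 0); // best score for a prefix of length i
    std::vector<int> prev (len + 1, 0); // id of the token ending that prefix

    for (int i = 0; i < len; i++) {
        for (int sub_len = 1; i + sub_len <= len; sub_len++) {
            const auto it = vocab.find(text.substr(i, sub_len));
            if (it == vocab.end()) continue;
            const int local_score = score[i] + sub_len*sub_len;
            if (local_score > score[i + sub_len]) {
                score[i + sub_len] = local_score;
                prev [i + sub_len] = it->second;
            }
        }
    }

    // "hello" as one piece scores 25, beating "he" + "llo" (4 + 9 = 13).
    printf("score = %d, last token id = %d\n", score[len], prev[len]);
    return 0;
}
```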
@@ -491,7 +519,8 @@ size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t
 size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
     const int nb = k / qk;
 
-    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
+    const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2);
+    const size_t row_size = nb*bs;
 
     assert(k % qk == 0);
 
@@ -500,10 +529,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
 
     char * pdst = (char *) dst;
 
-    for (int j = 0; j < n; j += k) {
-        float   * pm = (float *)   (pdst + (j/k)*row_size);
-        float   * pd = (float *)   (pm + nb);
-        uint8_t * pb = (uint8_t *) (pd + nb);
+    for (int j = 0; j < n; j += k) {
+        uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
+        uint8_t * pm = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
+        uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float));
 
         //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
 
@@ -521,8 +550,10 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
             const float d = (max - min) / ((1 << 4) - 1);
             const float id = d ? 1.0f/d : 0.0f;
 
-            pm[i] = min;
-            pd[i] = d;
+            *(float *) pd = d;
+            *(float *) pm = min;
+            pd += bs;
+            pm += bs;
 
             for (int l = 0; l < qk; l += 2) {
                 const float v0 = (src[j + i*qk + l + 0] - min)*id;
@@ -540,7 +571,8 @@ size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t
                 pp[l/2] = vi0 | (vi1 << 4);
             }
 
-            memcpy(pb + i*qk/2, pp, pp_size);
+            memcpy(pb, pp, pp_size);
+            pb += bs;
         }
     }
 }
diff --git a/utils.h b/utils.h
index dca497d06..ae263d452 100644
--- a/utils.h
+++ b/utils.h
@@ -18,7 +18,7 @@ struct gpt_params {
     int32_t n_predict = 128; // new tokens to predict
     int32_t repeat_last_n = 64;  // last n tokens to penalize
     int32_t n_ctx = 512; //context size
-
+
     // sampling parameters
     int32_t top_k = 40;
     float   top_p = 0.95f;