Merge branch 'master' of github.com:ggerganov/llama.cpp

2023-08-14 09:21:10 +02:00 · 2023-08-14 09:21:10 +02:00 · dbdb2c1353
commit dbdb2c1353
parent 2c8e92044e 5517d6e692
23 changed files with 4960 additions and 3011 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,6 +1,7 @@
 *.o
 *.a
 *.so
+*.bin
 .DS_Store
 .build/
 .cache/
@ -39,6 +40,7 @@ models-mnt
 /perplexity
 /embedding
 /train-text-from-scratch
+/convert-llama2c-to-ggml
 /simple
 /benchmark-matmult
 /vdot
@ -68,6 +70,7 @@ poetry.lock
 poetry.toml

 # Test binaries
+tests/test-grammar-parser
 tests/test-double-float
 tests/test-grad0
 tests/test-opt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -69,7 +69,6 @@ option(LLAMA_BLAS                            "llama: use BLAS"
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS                          "llama: use CUDA"                                  OFF)
 #option(LLAMA_CUDA_CUBLAS                     "llama: use cuBLAS for prompt processing"          OFF)
-set(LLAMA_CUDA_MMQ_Y       "64" CACHE STRING "llama: y tile size for mmq CUDA kernels")
 option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
@ -256,7 +255,6 @@ if (LLAMA_CUBLAS)
 #        if (LLAMA_CUDA_CUBLAS)
 #            add_compile_definitions(GGML_CUDA_CUBLAS)
 #        endif()
-        add_compile_definitions(GGML_CUDA_MMQ_Y=${LLAMA_CUDA_MMQ_Y})
        if (LLAMA_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
--- a/12
+++ b/12
@ -1,8 +1,8 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch simple server embd-input-test
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
+TEST_TARGETS = tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0

 default: $(BUILD_TARGETS)

@ -345,7 +345,7 @@ libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)

 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch embd-input-test build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test build-info.h $(TEST_TARGETS)

 #
 # Examples
@ -388,6 +388,9 @@ embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-te
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp    build-info.h ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

+# llama2.c -> ggml model converter example; header prerequisites are filtered out of the compiler inputs
+convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp    build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 build-info.h: $(wildcard .git/index) scripts/build-info.sh
 	@sh scripts/build-info.sh > $@.tmp
 	@if ! cmp -s $@.tmp $@; then \
@ -409,6 +412,9 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o
 vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)

+# Grammar parser unit test. build-info.h is listed only so the test rebuilds when it changes;
+# it must not be handed to the compiler, so filter out %.h as the example targets do.
+tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
 tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -42,6 +42,7 @@ else()
    add_subdirectory(benchmark)
    add_subdirectory(baby-llama)
    add_subdirectory(train-text-from-scratch)
+    add_subdirectory(convert-llama2c-to-ggml)
    add_subdirectory(simple)
    add_subdirectory(embd-input)
    if (LLAMA_METAL)
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -543,7 +543,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stdout, "  --in-suffix STRING    string to suffix after user inputs with (default: empty)\n");
    fprintf(stdout, "  -f FNAME, --file FNAME\n");
    fprintf(stdout, "                        prompt file to start generation.\n");
-    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict);
+    fprintf(stdout, "  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
    fprintf(stdout, "  -c N, --ctx-size N    size of the prompt context (default: %d)\n", params.n_ctx);
    fprintf(stdout, "  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stdout, "  -gqa N, --gqa N       grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa);
--- a/examples/console.cpp
+++ b/examples/console.cpp
@ -10,6 +10,9 @@
 #include <windows.h>
 #include <fcntl.h>
 #include <io.h>
+#ifndef ENABLE_VIRTUAL_TERMINAL_PROCESSING
+#define ENABLE_VIRTUAL_TERMINAL_PROCESSING 0x0004
+#endif
 #else
 #include <climits>
 #include <sys/ioctl.h>
@ -68,9 +71,10 @@ namespace console {
            }
        }
        if (hConsole) {
-            // Enable ANSI colors on Windows 10+
-            if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
-                SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING);
+            // Check conditions combined to reduce nesting
+            if (advanced_display && !(dwMode & ENABLE_VIRTUAL_TERMINAL_PROCESSING) &&
+                !SetConsoleMode(hConsole, dwMode | ENABLE_VIRTUAL_TERMINAL_PROCESSING)) {
+                advanced_display = false;
            }
            // Set console output codepage to UTF8
            SetConsoleOutputCP(CP_UTF8);
--- a/examples/convert-llama2c-to-ggml/CMakeLists.txt
+++ b/examples/convert-llama2c-to-ggml/CMakeLists.txt
@ -0,0 +1,5 @@
+# Executable that converts a llama2.c checkpoint into a ggml model file.
+set(TARGET convert-llama2c-to-ggml)
+add_executable(${TARGET} convert-llama2c-to-ggml.cpp)
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+install(TARGETS ${TARGET} RUNTIME)
--- a/examples/convert-llama2c-to-ggml/README.md
+++ b/examples/convert-llama2c-to-ggml/README.md
@ -0,0 +1,26 @@
+## Convert llama2.c model to ggml
+
+This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
+
+To convert the model, first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository, then build the examples:
+
+`$ make -j`
+
+After successful compilation, the following usage options are available:
+```
+usage: ./convert-llama2c-to-ggml [options]
+
+options:
+  -h, --help                       show this help message and exit
+  --copy-vocab-from-model FNAME    model path from which to copy vocab (default 'models/ggml-vocab.bin')
+  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model
+  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default ak_llama_model.bin')
+```
+
+An example command is as follows:
+
+`$ ./convert-llama2c-to-ggml --copy-vocab-from-model <ggml-vocab.bin> --llama2c-model <llama2.c model path> --llama2c-output-model <ggml output model path>`
+
+Now you can use the model with a command like:
+
+`$ ./main -m <ggml output model path> -p "One day, Lily met a Shoggoth" -n 500 -c 256 -eps 1e-5`
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@ -0,0 +1,825 @@
+#include "ggml.h"
+#include "llama.h"
+#include <unordered_map>
+#include <vector>
+#include <cassert>
+#include <climits>
+#include <cstring>
+#include <cstdarg>
+#include <ctime>
+#include <random>
+#include <stdexcept>
+#include <algorithm>
+#include <string>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+//////////////////////////////////////// llama2.c model structs and functions to load models, alloc memory etc.
+// Model hyper-parameters of a llama2.c checkpoint.
+// NOTE(review): the field order presumably mirrors the checkpoint header layout used by llama2.c;
+// the code that fills this struct is not visible here — confirm before reordering fields.
+typedef struct {
+    int dim; // transformer dimension
+    int hidden_dim; // for ffn layers
+    int n_layers; // number of layers
+    int n_heads; // number of query heads
+    int n_kv_heads; // number of key/value heads (can be < query heads because of multiquery)
+    int vocab_size; // vocabulary size, usually 256 (byte-level)
+    int seq_len; // max sequence length
+} Config;
+
+// Flat float buffers holding the llama2.c weights. Per-layer arrays store all layers
+// back to back; the shape noted on each field is (outermost, ..., innermost).
+// Buffers are allocated by malloc_weights() and released by free_weights().
+typedef struct {
+    // token embedding table
+    float* token_embedding_table;    // (vocab_size, dim)
+    // weights for rmsnorms
+    float* rms_att_weight; // (layer, dim) rmsnorm weights
+    float* rms_ffn_weight; // (layer, dim)
+    // weights for matmuls
+    float* wq; // (layer, dim, dim)
+    float* wk; // (layer, dim, dim)
+    float* wv; // (layer, dim, dim)
+    float* wo; // (layer, dim, dim)
+    // weights for ffn
+    float* w1; // (layer, hidden_dim, dim)
+    float* w2; // (layer, dim, hidden_dim)
+    float* w3; // (layer, hidden_dim, dim)
+    // final rmsnorm
+    float* rms_final_weight; // (dim,)
+    // freq_cis for RoPE relative positional embeddings (not stored here)
+    // float* freq_cis_real; // (seq_len, dim/2)
+    // float* freq_cis_imag; // (seq_len, dim/2)
+    // (optional) classifier weights for the logits, on the last layer (not stored here)
+    //float* wcls;
+} TransformerWeights;
+
+// Allocate a zero-initialized float buffer for every weight array in `w`, sized from the
+// dimensions in `p`, and log each allocation to stdout.
+void malloc_weights(TransformerWeights* w, Config* p) {
+    const int dim        = p->dim;
+    const int hidden_dim = p->hidden_dim;
+    const int n_layers   = p->n_layers;
+    const int vocab_size = p->vocab_size;
+
+    // element counts shared by several of the arrays below
+    const int n_embed = vocab_size * dim;
+    const int n_norm  = n_layers * dim;
+    const int n_attn  = n_layers * dim * dim;
+    const int n_ffn   = n_layers * hidden_dim * dim;
+
+    // new float[n]() value-initializes, so every buffer starts out as zeros
+    w->token_embedding_table = new float[n_embed]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n", __func__, vocab_size, dim, n_embed);
+
+    w->rms_att_weight = new float[n_norm]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n", __func__, n_layers, dim, n_norm);
+
+    w->rms_ffn_weight = new float[n_norm]();
+    printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n", __func__, n_layers, dim, n_norm);
+
+    w->wq = new float[n_attn]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n", __func__, n_layers, dim, dim, n_attn);
+
+    w->wk = new float[n_attn]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n", __func__, n_layers, dim, dim, n_attn);
+
+    w->wv = new float[n_attn]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n", __func__, n_layers, dim, dim, n_attn);
+
+    w->wo = new float[n_attn]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n", __func__, n_layers, dim, dim, n_attn);
+
+    w->w1 = new float[n_ffn]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n", __func__, n_layers, hidden_dim, dim, n_ffn);
+
+    w->w2 = new float[n_ffn]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n", __func__, n_layers, dim, hidden_dim, n_ffn);
+
+    w->w3 = new float[n_ffn]();
+    printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n", __func__, n_layers, hidden_dim, dim, n_ffn);
+
+    w->rms_final_weight = new float[dim]();
+    printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n", __func__, dim);
+}
+
+// Fill the pre-allocated buffers in `w` by reading raw floats from `f`, one array after
+// another in the order listed below. Returns 0 when every array was read completely,
+// 1 as soon as any read comes up short.
+int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f) {
+    const size_t n_embed = static_cast<size_t>(p->vocab_size * p->dim);
+    const size_t n_norm  = static_cast<size_t>(p->n_layers * p->dim);
+    const size_t n_attn  = static_cast<size_t>(p->n_layers * p->dim * p->dim);
+    const size_t n_ffn   = static_cast<size_t>(p->n_layers * p->dim * p->hidden_dim);
+    const size_t n_final = static_cast<size_t>(p->dim);
+
+    // && short-circuits, so no further reads are attempted after the first short one
+    const bool all_read =
+        fread(w->token_embedding_table, sizeof(float), n_embed, f) == n_embed &&
+        fread(w->rms_att_weight,        sizeof(float), n_norm,  f) == n_norm  &&
+        fread(w->wq,                    sizeof(float), n_attn,  f) == n_attn  &&
+        fread(w->wk,                    sizeof(float), n_attn,  f) == n_attn  &&
+        fread(w->wv,                    sizeof(float), n_attn,  f) == n_attn  &&
+        fread(w->wo,                    sizeof(float), n_attn,  f) == n_attn  &&
+        fread(w->rms_ffn_weight,        sizeof(float), n_norm,  f) == n_norm  &&
+        fread(w->w1,                    sizeof(float), n_ffn,   f) == n_ffn   &&
+        fread(w->w2,                    sizeof(float), n_ffn,   f) == n_ffn   &&
+        fread(w->w3,                    sizeof(float), n_ffn,   f) == n_ffn   &&
+        fread(w->rms_final_weight,      sizeof(float), n_final, f) == n_final;
+
+    return all_read ? 0 : 1;
+}
+
+// Release every buffer allocated by malloc_weights().
+// The buffers are created with new[], so they must be released with delete[];
+// using scalar delete on them is undefined behavior.
+void free_weights(TransformerWeights* w) {
+    delete[] w->token_embedding_table;
+    delete[] w->rms_att_weight;
+    delete[] w->rms_ffn_weight;
+    delete[] w->wq;
+    delete[] w->wk;
+    delete[] w->wv;
+    delete[] w->wo;
+    delete[] w->w1;
+    delete[] w->w2;
+    delete[] w->w3;
+    delete[] w->rms_final_weight;
+}
+
+// Debug helper: print the first element of every weight array in `w`, one value per line.
+void print_sample_weights(TransformerWeights *w){
+    printf("----- Quick print of first of the weight vales of all the variables\n");
+    printf("%f\n", w->token_embedding_table[0]);
+    printf("%f\n", w->rms_att_weight[0]);
+    printf("%f\n", w->rms_ffn_weight[0]);
+
+    printf("%f\n", w->wq[0]);
+    printf("%f\n", w->wk[0]);
+    printf("%f\n", w->wv[0]);
+    printf("%f\n", w->wo[0]);
+    printf("%f\n", w->w1[0]);
+    printf("%f\n", w->w2[0]);
+    printf("%f\n", w->w3[0]);
+    // the final norm weight was previously skipped: rms_att_weight was printed a second time here
+    printf("%f\n", w->rms_final_weight[0]);
+}
+////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////// ggml structs and functions required to load models, configs and save the model.
+
+// Vocabulary container: maps token text to id and id back to token text plus score.
+struct llama_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    // one vocabulary entry: the token text and its score
+    struct token_score {
+        token tok;
+        float score;
+    };
+
+    std::unordered_map<token, id> token_to_id; // token text -> id
+    std::vector<token_score> id_to_token;      // indexed by token id
+};
+
+// Hyper-parameters of the ggml-side model, with defaults for each field.
+struct my_llama_hparams {
+    uint32_t n_vocab = 32000;
+    uint32_t n_ctx   = 512;   // this is provided as user input?
+    uint32_t n_embd  = 4096;
+    uint32_t n_mult  = 4;
+    uint32_t n_head  = 32;
+    uint32_t n_layer = 32;
+    uint32_t n_rot   = 64;
+
+    // Byte-wise comparison of the whole struct: true when any field differs.
+    bool operator!=(const my_llama_hparams& other) const {
+        return memcmp(this, &other, sizeof(my_llama_hparams)) != 0;
+    }
+};
+
+// Tensors belonging to a single transformer layer; all are created in init_model().
+struct my_llama_layer {
+    // normalization
+    struct ggml_tensor * attention_norm;
+
+    // attention
+    struct ggml_tensor * wq;
+    struct ggml_tensor * wk;
+    struct ggml_tensor * wv;
+    struct ggml_tensor * wo;
+
+    // normalization
+    struct ggml_tensor * ffn_norm;
+
+    // ff
+    struct ggml_tensor * w1;
+    struct ggml_tensor * w2;
+    struct ggml_tensor * w3;
+};
+
+// ggml-side model: hyper-parameters, the top-level tensors and the per-layer tensors.
+// The tensor pointers have no initializer; init_model() assigns them using `ctx`.
+struct my_llama_model {
+    struct ggml_context * ctx = NULL; // context the tensors are created in; read by init_model()
+
+    my_llama_hparams hparams;
+
+    struct ggml_tensor * tok_embeddings;
+
+    struct ggml_tensor * norm;
+    struct ggml_tensor * output;
+
+    std::vector<my_llama_layer> layers; // resized to hparams.n_layer by init_model()
+
+    // training counters; init_model() resets them to 0
+    uint32_t train_its = 0;
+    uint32_t train_samples = 0;
+    uint32_t train_tokens = 0;
+};
+
+// Bundle of file names and numeric options for the tool.
+// NOTE(review): most fields look like training options (optimizer, memory budgets, ...);
+// which of them this converter actually reads is not visible in this part of the file — confirm.
+struct train_params {
+    // file names
+    const char * fn_vocab_model;
+    const char * fn_llama2c_model;
+    const char * fn_llama2c_output_model;
+    const char * fn_train_data;
+    const char * fn_checkpoint_in;
+    const char * fn_checkpoint_out;
+    const char * fn_model_out;
+
+    uint32_t seed;
+
+    // model shape
+    int n_ctx;
+    int n_embd;
+    int n_mult;
+    int n_head;
+    int n_layer;
+    int n_rotmax;
+
+    int n_threads;
+    int n_batch;
+    int n_examples;
+    int n_predict;
+
+    int print_info_interval;
+    int print_details_interval;
+
+    bool samples_start_after_nl;
+    bool use_adam;
+    bool use_flash;
+    bool use_scratch;
+
+    // only adam
+    int   warmup;
+    int   cos_decay_steps;
+    float cos_decay_restart;
+    float cos_decay_alpha;
+
+    int   lbfgs_n_iter;
+    int   adam_n_iter;
+    float adam_alpha;
+    float adam_decay;
+
+    // memory budgets (the _gb suffix suggests gigabytes — TODO confirm)
+    int mem_model_gb;
+    int mem_compute_gb;
+    int mem_compute0_gb;
+    int mem_compute1_gb;
+};
+
+// Feed-forward width: 2*(4*n_embd)/3 rounded up to the next multiple of n_mult.
+uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
+    const uint32_t multiple = hparams->n_mult;
+    const uint32_t raw      = 2*(4*hparams->n_embd)/3;
+    const uint32_t n_blocks = (raw + multiple - 1)/multiple; // ceiling division
+    return n_blocks*multiple;
+}
+
+// Print every hyper-parameter in `params`, plus the derived feed-forward size, to stdout.
+void print_params(struct my_llama_hparams * params) {
+    // all values are uint32_t, so the matching conversion is %u rather than %d
+    printf("%s: n_vocab: %u\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %u\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %u\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %u\n", __func__, params->n_mult);
+    printf("%s: n_head:  %u\n", __func__, params->n_head);
+    printf("%s: n_ff:    %u\n", __func__, get_n_ff(params));
+    printf("%s: n_layer: %u\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %u\n", __func__, params->n_rot);
+}
+
+// Create every model tensor in model->ctx with shapes derived from model->hparams, name each
+// tensor, and reset the training counters to 0. Element counts for most tensors are logged
+// to stdout. model->ctx and model->hparams must be set by the caller beforehand.
+void init_model(struct my_llama_model * model) {
+    const auto & hparams = model->hparams;
+
+    const uint32_t n_embd  = hparams.n_embd;
+    const uint32_t n_layer = hparams.n_layer;
+    const uint32_t n_vocab = hparams.n_vocab;
+
+    const uint32_t n_ff = get_n_ff(&hparams);
+    struct ggml_context * ctx = model->ctx;
+
+    model->train_its = 0;
+    model->train_samples = 0;
+    model->train_tokens = 0;
+
+    // top-level tensors (all F32)
+    model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x [%d] = [%d] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab);
+
+    model->norm           = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+    printf("[%s:GG] Allocating [%d] float space for model->norm\n",__func__,n_embd);
+
+    model->output         = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab);
+
+    // printing the per-layer allocations here so we dont print in the for loop.
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wq for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wk for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wv for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.wo for [%d] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] float space for layer.ffn_norm for [%d] layers\n",__func__,n_embd, n_layer);
+
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w1 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w2 for [%d] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer);
+    printf("[%s:GG] Allocating [%d] x[%d] = [%d] float space for layer.w3 for [%d] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer);
+
+    ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
+    ggml_set_name(model->norm,           "norm.weight");
+    ggml_set_name(model->output,         "output.weight");
+
+    // per-layer tensors, named "layers.<i>.<component>.weight"
+    model->layers.resize(n_layer);
+    for (uint32_t i = 0; i < n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        std::string layers_i = "layers." + std::to_string(i);
+
+        layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+        layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
+
+        layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+        // w1/w3 are created as (n_embd, n_ff); w2 has the two sizes swapped
+        layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+        layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
+        layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
+
+        ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
+
+        ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
+        ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
+        ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
+        ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
+
+        ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
+
+        ggml_format_name(layer.w1, "%s.feed_forward.w1.weight", layers_i.c_str());
+        ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str());
+        ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str());
+    }
+}
+
+// Read the float at index (i0, i1) of `tensor`, locating it through the byte strides nb[0]/nb[1].
+float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    char * base = (char *) tensor->data;
+    char * elem = base + i0*tensor->nb[0] + i1*tensor->nb[1];
+    return *(float *) elem;
+}
+
+// Read the int32 at index (i0, i1) of `tensor`, locating it through the byte strides nb[0]/nb[1].
+int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
+    char * base = (char *) tensor->data;
+    char * elem = base + i0*tensor->nb[0] + i1*tensor->nb[1];
+    return *(int32_t *) elem;
+}
+
+// Print row `i` of a 2-d float tensor on one line, each value preceded by a space.
+void print_row(struct ggml_tensor * probs, int i) {
+    int col = 0;
+    while (col < probs->ne[0]) {
+        printf(" %f", get_f32_2d(probs, col, i));
+        ++col;
+    }
+    printf("\n");
+}
+
+// Print a 2-d float tensor row by row with two decimals per value.
+void print_matrix(struct ggml_tensor * probs) {
+    assert(probs->n_dims == 2);
+    int row = 0;
+    while (row < probs->ne[1]) {
+        int col = 0;
+        while (col < probs->ne[0]) {
+            printf(" %.2f", get_f32_2d(probs, col, row));
+            ++col;
+        }
+        printf("\n");
+        ++row;
+    }
+}
+
+// printf-style formatting into a std::string. The argument list is formatted twice:
+// once to measure the required length, once to produce the text.
+#ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
+__attribute__((format(printf, 1, 2)))
+#endif
+#endif
+static std::string format(const char * fmt, ...) {
+    va_list measure_args, write_args;
+    va_start(measure_args, fmt);
+    va_copy(write_args, measure_args);
+
+    // pass 1: a NULL buffer makes vsnprintf report only the needed length
+    int size = vsnprintf(NULL, 0, fmt, measure_args);
+    GGML_ASSERT(size >= 0 && size < INT_MAX);
+
+    // pass 2: format into a buffer with room for the terminating NUL
+    std::vector<char> text(size + 1);
+    int size2 = vsnprintf(text.data(), size + 1, fmt, write_args);
+    GGML_ASSERT(size2 == size);
+
+    va_end(write_args);
+    va_end(measure_args);
+    return std::string(text.data(), size);
+}
+
+// Thin wrapper around a C FILE* with helpers for reading and writing raw binary values.
+// The file is opened in the constructor and closed in the destructor. If opening fails,
+// `fp` is NULL and `size` is 0; callers must check `fp` before reading or writing.
+struct llama_file {
+    // use FILE * so we don't have to re-open the file to mmap
+    FILE * fp;
+    size_t size; // total file size in bytes, measured once at open time
+
+    // Open `fname` with the given fopen mode and record the file size.
+    llama_file(const char * fname, const char * mode) {
+        fp = std::fopen(fname, mode);
+        if (fp == NULL) {
+            size = 0;
+        } else {
+            // measure the size by seeking to the end, then rewind
+            seek(0, SEEK_END);
+            size = tell();
+            seek(0, SEEK_SET);
+        }
+    }
+
+    // Current position in the file, in bytes.
+    size_t tell() const {
+#ifdef _WIN32
+        __int64 ret = _ftelli64(fp);
+#else
+        long ret = std::ftell(fp);
+#endif
+        GGML_ASSERT(ret != -1); // this really shouldn't fail
+        return (size_t) ret;
+    }
+
+    // Move the file position; `whence` is SEEK_SET / SEEK_CUR / SEEK_END.
+    void seek(size_t offset, int whence) {
+#ifdef _WIN32
+        int ret = _fseeki64(fp, (__int64) offset, whence);
+#else
+        int ret = std::fseek(fp, (long) offset, whence);
+#endif
+        GGML_ASSERT(ret == 0); // same
+    }
+
+    // Read exactly `size` bytes into `ptr`.
+    // Throws std::runtime_error on a read error or when the file ends early.
+    void read_raw(void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        std::size_t ret = std::fread(ptr, size, 1, fp);
+        if (ferror(fp)) {
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
+        }
+        if (ret != 1) {
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
+        }
+    }
+
+    // Read one 32-bit unsigned integer in native byte order.
+    std::uint32_t read_u32() {
+        std::uint32_t ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    // Read one float in native representation.
+    // std::float_t is allowed to be wider than float (it depends on FLT_EVAL_METHOD), so
+    // reading sizeof(std::float_t) bytes could consume more than one stored float. Read
+    // into a plain float and let the return statement convert it.
+    std::float_t read_f32() {
+        float ret;
+        read_raw(&ret, sizeof(ret));
+        return ret;
+    }
+
+    // Read `len` bytes and return them as a string (no terminator is expected in the file).
+    std::string read_string(std::uint32_t len) {
+        std::vector<char> chars(len);
+        read_raw(chars.data(), len);
+        return std::string(chars.data(), len);
+    }
+
+    // Write exactly `size` bytes from `ptr`; throws std::runtime_error on failure.
+    void write_raw(const void * ptr, size_t size) {
+        if (size == 0) {
+            return;
+        }
+        errno = 0;
+        size_t ret = std::fwrite(ptr, size, 1, fp);
+        if (ret != 1) {
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
+        }
+    }
+
+    // Write one 32-bit unsigned integer in native byte order.
+    void write_u32(std::uint32_t val) {
+        write_raw(&val, sizeof(val));
+    }
+
+    ~llama_file() {
+        if (fp) {
+            std::fclose(fp);
+        }
+    }
+};
+
+// Serialize one tensor to `file`: a header (dimension count, name length, type), the
+// dimension sizes, the name, padding up to the next multiple of 32 bytes, then the raw data.
+// A NULL tensor is written as a header with zero dimensions and an empty name, plus padding.
+void write_tensor(struct llama_file * file, struct ggml_tensor * tensor) {
+    if (tensor == NULL) {
+        file->write_u32(0);
+        file->write_u32(0);
+        file->write_u32(GGML_TYPE_F32);
+        const size_t pad = (0-file->tell()) & 31; // bytes up to the next multiple of 32
+        file->seek(pad, SEEK_CUR);
+        return;
+    }
+
+    const char * name = ggml_get_name(tensor);
+    const uint32_t name_len = strlen(name);
+    const uint32_t nd = tensor->n_dims;
+
+    // dimension sizes are stored as 32-bit values
+    uint32_t ne[4];
+    for (int d = 0; d < 4; ++d) {
+        ne[d] = (uint32_t) tensor->ne[d];
+    }
+
+    file->write_u32(nd);
+    file->write_u32(name_len);
+    file->write_u32(tensor->type);
+    file->write_raw(ne, sizeof(ne[0]) * nd); // only the first nd sizes are written
+    file->write_raw(name, name_len);
+
+    const size_t pad = (0-file->tell()) & 31; // bytes up to the next multiple of 32
+    file->seek(pad, SEEK_CUR);
+    file->write_raw(tensor->data, ggml_nbytes(tensor));
+}
+
+// True when `filename` starts with the ggml/llama file magic.
+// A file that cannot be opened has size 0 and is therefore reported as "not ggml".
+bool is_ggml_file(const char *filename) {
+    llama_file file(filename, "rb");
+    if (file.size >= 4) {
+        const uint32_t magic = file.read_u32();
+        return magic == LLAMA_FILE_MAGIC;
+    }
+    // too short to hold the 4-byte magic
+    return false;
+}
+
+// Fill `vocab` from `filename`, which is either a ggml model file (detected through its magic)
+// or, otherwise, assumed to be a llama2.c tokenizer file with config->vocab_size entries.
+// Throws std::runtime_error when the file cannot be opened or loaded, or when it ends early.
+void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
+    // heuristic to infer whether vocab is from ggml or from llama2.c vocabulary
+    if (is_ggml_file(filename)) {
+
+        struct llama_context_params llama_params = llama_context_default_params();
+        llama_params.vocab_only = true;
+
+        // both calls can return NULL; fail with a message instead of dereferencing it
+        struct llama_model * lmodel = llama_load_model_from_file(filename, llama_params);
+        if (lmodel == NULL) {
+            throw std::runtime_error(std::string("failed to load vocab model from ") + filename);
+        }
+        struct llama_context * lctx = llama_new_context_with_model(lmodel, llama_params);
+        if (lctx == NULL) {
+            llama_free_model(lmodel);
+            throw std::runtime_error(std::string("failed to create context for vocab model ") + filename);
+        }
+
+        std::vector<const char *> strings;
+        std::vector<float> scores;
+        int n_vocab = llama_n_vocab(lctx);
+        strings.resize(n_vocab, NULL);
+        scores.resize(n_vocab, 0);
+        n_vocab = llama_get_vocab(lctx, strings.data(), scores.data(), n_vocab);
+        GGML_ASSERT(n_vocab == llama_n_vocab(lctx));
+        vocab->id_to_token.resize(n_vocab);
+        for (int i=0; i<n_vocab; ++i) {
+            std::string tok   = std::string(strings[i]);
+            float       score = scores[i];
+            vocab->id_to_token[i].tok   = tok;
+            vocab->id_to_token[i].score = score;
+            vocab->token_to_id.emplace(tok, i);
+        }
+        llama_free(lctx);
+        llama_free_model(lmodel);
+    } else { // assume llama2.c vocabulary
+        printf("Assuming llama2.c vocabulary since %s is not a ggml file\n", filename);
+        llama_file file(filename, "rb");
+        // is_ggml_file() also returns false for a file that cannot be opened, so check here
+        // before reading through a NULL FILE*
+        if (file.fp == NULL) {
+            throw std::runtime_error(std::string("failed to open vocab file ") + filename);
+        }
+        uint32_t n_vocab = config->vocab_size;
+        /* uint32_t max_token_length =  */ file.read_u32(); // unused
+        vocab->id_to_token.resize(n_vocab);
+        // each entry is stored as: float score, u32 length, then `length` bytes of token text
+        for (uint32_t i=0; i<n_vocab; ++i) {
+            float_t score = file.read_f32();
+            uint32_t len = file.read_u32();
+            std::string tok = file.read_string(len);
+            vocab->id_to_token[i].tok = tok;
+            vocab->id_to_token[i].score = score;
+            vocab->token_to_id.emplace(tok, i);
+        }
+    }
+}
+
+// Copy a contiguous float array into a 1-, 2- or 3-dimensional ggml tensor. Source values are
+// consumed in order with dimension 0 varying fastest; each destination element is located
+// through the tensor's byte strides. Tensors with any other dimension count are left untouched.
+void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
+    const float * src = karpathy_weights;
+    switch (gg_weights->n_dims){
+        case 1:
+            for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){
+                char * dst = (char *) gg_weights->data + i0*gg_weights->nb[0];
+                *(float *) dst = *src;
+                ++src;
+            }
+            break;
+        case 2:
+            for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                    char * dst = (char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1];
+                    *(float *) dst = *src;
+                    ++src;
+                }
+            }
+            break;
+        case 3:
+            for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) {
+                for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) {
+                    for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) {
+                        char * dst = (char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2];
+                        *(float *) dst = *src;
+                        ++src;
+                    }
+                }
+            }
+            break;
+    }
+}
+
+// Copies the llama2.c weights in `w` into the ggml tensors of `model` and writes
+// the result to `filename`.
+//
+// Output order: magic, version, hyper-parameters (with ftype LLAMA_FTYPE_ALL_F32),
+// one (length, bytes, score) record per vocabulary entry, then the tensors.
+//
+//   vocab    - source of the token strings and scores; must hold at least
+//              hparams.n_vocab entries (id_to_token.at() throws otherwise)
+//   model    - tensors must already be allocated; they are overwritten here
+//   w        - llama2.c weights; the per-layer arrays are indexed as consecutive
+//              per-layer slices
+//   filename - destination path, opened in "wb" mode
+//
+// If the file handle is NULL after opening, an error is printed to stderr and
+// nothing is written.
+void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
+    struct llama_file file(filename, "wb");
+    if (file.fp == NULL) {
+        fprintf(stderr, "%s: unable to open '%s' for writing\n", __func__, filename);
+        return;
+    }
+
+    const auto & hparams = model->hparams;
+
+    // write_magic
+    file.write_u32(LLAMA_FILE_MAGIC);   // magic
+    file.write_u32(LLAMA_FILE_VERSION); // version
+    // write_hparams
+    file.write_u32(hparams.n_vocab);
+    file.write_u32(hparams.n_embd);
+    file.write_u32(hparams.n_mult);
+    file.write_u32(hparams.n_head);
+    file.write_u32(hparams.n_layer);
+    file.write_u32(hparams.n_rot);
+    file.write_u32(LLAMA_FTYPE_ALL_F32);
+
+    // write_vocab - for now we are just writing the existing BPE voc. assuming karpathy's vocabulary is the same. idk.
+    const uint32_t n_vocab = hparams.n_vocab;
+    for (uint32_t i = 0; i < n_vocab; i++) {
+        const auto & token_score = vocab->id_to_token.at(i);
+        file.write_u32((uint32_t) token_score.tok.size());
+        file.write_raw(token_score.tok.data(), token_score.tok.size());
+        file.write_raw(&token_score.score, sizeof(token_score.score));
+    }
+
+    // stuff AK weights into GG weights one by one.
+    // w->token_embedding_table -> model->tok_embeddings
+    // float*                   -> struct ggml_tensor
+    // The output tensor is filled from the same embedding table as tok_embeddings.
+    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->token_embedding_table);
+
+    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
+
+    // Slice sizes are kept in size_t so that layer * n_embd * n_ff cannot wrap
+    // around in 32-bit arithmetic for large checkpoints.
+    const size_t n_embd = hparams.n_embd;
+    const size_t n_ff   = (size_t) get_n_ff(&hparams);
+
+    const size_t norm_stride = n_embd;          // one norm vector per layer
+    const size_t attn_stride = n_embd * n_embd; // one attention matrix per layer
+    const size_t ffn_stride  = n_embd * n_ff;   // one feed-forward matrix per layer
+
+    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+        const size_t il = i;
+
+        // 1d
+        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[il*norm_stride]);
+        stuff_karpathy_weights_into_gg(layer.ffn_norm      , &w->rms_ffn_weight[il*norm_stride]);
+
+        // from 3d matrix layer x dim x dim to 2d matrix dim x dim
+        stuff_karpathy_weights_into_gg(layer.wq            , &w->wq[il*attn_stride]);
+        stuff_karpathy_weights_into_gg(layer.wk            , &w->wk[il*attn_stride]);
+        stuff_karpathy_weights_into_gg(layer.wv            , &w->wv[il*attn_stride]);
+        stuff_karpathy_weights_into_gg(layer.wo            , &w->wo[il*attn_stride]);
+
+        stuff_karpathy_weights_into_gg(layer.w1            , &w->w1[il*ffn_stride]);
+        stuff_karpathy_weights_into_gg(layer.w2            , &w->w2[il*ffn_stride]);
+        stuff_karpathy_weights_into_gg(layer.w3            , &w->w3[il*ffn_stride]);
+    }
+
+    // write tensors
+    write_tensor(&file, model->tok_embeddings);
+    write_tensor(&file, model->norm);
+    write_tensor(&file, model->output); // ?
+    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+        auto & layer = model->layers[i];
+
+        write_tensor(&file, layer.attention_norm);
+        write_tensor(&file, layer.wq);
+        write_tensor(&file, layer.wk);
+        write_tensor(&file, layer.wv);
+        write_tensor(&file, layer.wo);
+        write_tensor(&file, layer.ffn_norm);
+        write_tensor(&file, layer.w1);
+        write_tensor(&file, layer.w2);
+        write_tensor(&file, layer.w3);
+    }
+}
+
+// Builds the default parameter set used before command-line parsing.
+//
+// The input checkpoint path (fn_llama2c_model) has no default: it is set to NULL
+// here so the returned struct never carries an indeterminate pointer, and
+// params_parse() requires the user to supply it.
+struct train_params get_default_train_params() {
+    struct train_params params;
+    params.fn_vocab_model    = "models/ggml-vocab.bin";
+    params.fn_llama2c_model  = NULL; // required argument, no default
+    params.fn_llama2c_output_model = "ak_llama_model.bin";
+    params.fn_train_data     = "shakespeare.txt";
+    params.fn_checkpoint_in  = "checkpoint.bin";
+    params.fn_checkpoint_out = "checkpoint.bin";
+    params.fn_model_out      = "ggml-checkpoint-f32.bin";
+
+    params.seed       =   -1;
+
+    params.n_ctx      =  128;
+    params.n_embd     =  256;
+    params.n_mult     =  256;
+    params.n_head     =    8;
+    params.n_layer    =   16;
+    params.n_rotmax   =   64;
+
+    params.n_threads  =    6;
+    params.n_batch    =    8;
+    params.n_examples =    8;
+    params.n_predict  = 1024;
+
+    params.print_info_interval    = 1;
+    params.print_details_interval = 2;
+
+    params.samples_start_after_nl = false;
+    params.use_adam               = true;
+    params.use_flash              = true;
+    params.use_scratch            = true;
+
+    // only adam
+    params.warmup            =  100;
+    params.cos_decay_steps   = 1000;
+    params.cos_decay_restart = 1.1f;
+    params.cos_decay_alpha   = 0.0f;
+
+    params.lbfgs_n_iter      = 16;
+    params.adam_n_iter       = 16;
+    params.adam_alpha        = 1e-3f;
+    params.adam_decay        = 1e-3f;
+
+    // memory budgets, in gigabytes
+    params.mem_model_gb   = 2;
+    params.mem_compute_gb = 24;
+    params.mem_compute0_gb = 8;
+    params.mem_compute1_gb = 2;
+
+    return params;
+}
+
+// Prints the command-line help text to stderr.
+//
+//   argv   - argv[0] is shown as the program name
+//   params - supplies the default values quoted in the option descriptions
+void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help                       show this help message and exit\n");
+    fprintf(stderr, "  --copy-vocab-from-model FNAME    llama2.c vocabulary or ggml model path from which to copy vocab (default '%s')\n", params->fn_vocab_model);
+    fprintf(stderr, "  --llama2c-model FNAME            [REQUIRED] model path from which to load Karpathy's llama2.c model\n");
+    // The default is quoted on both sides, matching the --copy-vocab-from-model line.
+    fprintf(stderr, "  --llama2c-output-model FNAME     model path to save the converted llama2.c model (default '%s')\n", params->fn_llama2c_output_model);
+    fprintf(stderr, "\n");
+}
+
+// Parses the command line into `params`.
+//
+// Underscores in "--" options are treated as dashes. Every recognised option other
+// than -h/--help consumes the argument that follows it. The function terminates
+// the process (after printing usage) on --help, on an unknown option, on an option
+// with no value, and when --llama2c-model was not given; otherwise it returns true.
+bool params_parse(int argc, char ** argv, struct train_params * params) {
+    struct train_params default_params = get_default_train_params();
+    const std::string arg_prefix = "--";
+
+    std::string arg;
+    bool value_missing = false; // a value-taking option was the last argument
+    bool have_input    = false; // --llama2c-model was supplied
+
+    for (int pos = 1; pos < argc; pos++) {
+        arg = argv[pos];
+        if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+            std::replace(arg.begin(), arg.end(), '_', '-');
+        }
+
+        if (arg == "-h" || arg == "--help") {
+            print_usage(argc, argv, &default_params);
+            exit(0);
+        }
+
+        const bool is_vocab  = (arg == "--copy-vocab-from-model");
+        const bool is_input  = (arg == "--llama2c-model");
+        const bool is_output = (arg == "--llama2c-output-model");
+
+        if (!is_vocab && !is_input && !is_output) {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            print_usage(argc, argv, &default_params);
+            exit(1);
+        }
+
+        // All remaining options take exactly one value.
+        if (++pos >= argc) {
+            value_missing = true;
+            break;
+        }
+
+        if (is_vocab) {
+            params->fn_vocab_model = argv[pos];
+        } else if (is_input) {
+            have_input = true;
+            params->fn_llama2c_model = argv[pos];
+        } else {
+            params->fn_llama2c_output_model = argv[pos];
+        }
+    }
+
+    if (value_missing) {
+        // `arg` still holds the option whose value was not provided
+        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
+        print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+    if (!have_input) {
+        fprintf(stderr, "error: please specify a llama2.c .bin file to be converted with argument --llama2c-model\n");
+        print_usage(argc, argv, &default_params);
+        exit(1);
+    }
+
+    return true;
+}
+
+// Entry point: loads a llama2.c checkpoint, builds a ggml model from its header
+// values, copies the weights across and saves the result.
+//
+// Returns 0 on success and 1 when the checkpoint cannot be opened or read, or
+// when the ggml context cannot be created.
+int main(int argc, char ** argv) {
+    struct train_params params = get_default_train_params();
+    if (!params_parse(argc, argv, &params)) {
+        return 1;
+    }
+    Config config;
+    TransformerWeights weights;
+    {
+        FILE *file = fopen(params.fn_llama2c_model, "rb");
+        if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
+        // read in the config header
+        if (fread(&config, sizeof(Config), 1, file) != 1) {
+            fprintf(stderr, "%s: failed to read the config header from %s\n", __func__, params.fn_llama2c_model);
+            fclose(file); // do not leak the handle on the error path
+            return 1;
+        }
+        // read in the Transformer weights
+        malloc_weights(&weights, &config);
+        if (checkpoint_init_weights(&weights, &config, file)) {
+            fprintf(stderr, "%s: failed to read the weights from %s\n", __func__, params.fn_llama2c_model);
+            fclose(file); // do not leak the handle on the error path
+            return 1;
+        }
+        fclose(file);
+    }
+
+    struct llama_vocab vocab;
+    load_vocab(params.fn_vocab_model, &config, &vocab);
+
+    // Model dimensions come from the checkpoint header; n_ctx and the rotation
+    // cap come from the parameter defaults, and n_mult is fixed at 32.
+    struct my_llama_model model;
+    model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx);
+    model.hparams.n_ctx   = params.n_ctx;
+    model.hparams.n_embd  = config.dim; //params.n_embd;
+    model.hparams.n_mult  = 32;//params.n_mult;
+    model.hparams.n_head  = config.n_heads; //params.n_head;
+    model.hparams.n_layer = config.n_layers; //params.n_layer;
+    model.hparams.n_rot   = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head);
+    print_params(&model.hparams);
+    struct ggml_init_params lcparams;
+    lcparams.mem_size   = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb);
+    lcparams.mem_buffer = NULL;
+    lcparams.no_alloc   = false;
+
+    model.ctx = ggml_init(lcparams);
+    if (model.ctx == NULL) {
+        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+        free_weights(&weights);
+        return 1;
+    }
+
+    init_model(&model);
+    save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model);
+
+    printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
+
+    ggml_free(model.ctx);
+    free_weights(&weights);
+    return 0;
+}
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -160,9 +160,13 @@ The following options allow you to control the text generation process and fine-

 ### Number of Tokens to Predict

-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity).
+-   `-n N, --n-predict N`: Set the number of tokens to predict when generating text (default: 128, -1 = infinity, -2 = until context filled)

-The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text. A value of -1 will cause text to be generated without limit.
+The `--n-predict` option controls the number of tokens the model generates in response to the input prompt. By adjusting this value, you can influence the length of the generated text. A higher value will result in longer text, while a lower value will produce shorter text.
+
+A value of -1 will enable infinite text generation, even though we have a finite context window. When the context window is full, some of the earlier tokens (half of the tokens after `--n-keep`) will be discarded. The context must then be re-evaluated before generation can resume. On large models and/or large context windows, this will result in a significant pause in output.
+
+If the pause is undesirable, a value of -2 will stop generation immediately when the context is filled.

 It is important to note that the generated text may be shorter than the specified number of tokens if an End-of-Sequence (EOS) token or a reverse prompt is encountered. In interactive mode text generation will pause and control will be returned to the user. In non-interactive mode, the program will end. In both cases, the text generation may stop before reaching the specified `n-predict` value. If you want the model to keep going without ever producing End-of-Sequence on its own, you can use the `--ignore-eos` parameter.

--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -431,8 +431,12 @@ int main(int argc, char ** argv) {
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
            if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
-                const int n_left = n_past - params.n_keep;
+                if (params.n_predict == -2) {
+                    fprintf(stderr, "\n\n%s: context full, stopping generation\n", __func__);
+                    break;
+                }

+                const int n_left = n_past - params.n_keep;
                // always keep the first token - BOS
                n_past = std::max(1, params.n_keep);
                n_past_guidance = std::max(1, params.n_keep + guidance_offset);
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@ -1,5 +1,34 @@
 import * as readline from 'node:readline'
 import { stdin, stdout } from 'node:process'
+import { readFileSync } from 'node:fs'
+import { SchemaConverter }  from './public/json-schema-to-grammar.mjs'
+
+const args = process.argv.slice(2);
+const grammarJsonSchemaFile = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema"
+);
+const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");
+
+// Example usage: function,arguments
+const grammarJsonSchemaPropOrder = args.find(
+    (_, index) => args[index - 1] === "--grammar-json-schema-prop-order"
+);
+const propOrder = grammarJsonSchemaPropOrder
+    ? grammarJsonSchemaPropOrder
+          .split(",")
+          .reduce((acc, cur, index) => ({ ...acc, [cur]: index }), {})
+    : {};
+
+let grammar = null
+if (grammarJsonSchemaFile) {
+    const schema = JSON.parse(readFileSync(grammarJsonSchemaFile, 'utf-8'))
+    const converter = new SchemaConverter(propOrder)
+    converter.visit(schema, '')
+    grammar = converter.formatGrammar()
+}
+if (grammarFile) {
+    grammar = readFileSync(grammarFile, 'utf-8')
+}

 const API_URL = 'http://127.0.0.1:8080'

@ -48,6 +77,7 @@ async function chat_completion(question) {
            n_keep: n_keep,
            n_predict: 256,
            stop: ["\n### Human:"], // stop completion after generating this
+            grammar,
            stream: true,
        })
    })
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/index.js.hpp
+++ b/examples/server/index.js.hpp
--- a/examples/server/json-schema-to-grammar.mjs.hpp
+++ b/examples/server/json-schema-to-grammar.mjs.hpp
@ -0,0 +1,311 @@
+unsigned char json_schema_to_grammar_mjs[] = {
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f,
+  0x52, 0x55, 0x4c, 0x45, 0x20, 0x3d, 0x20, 0x27, 0x22, 0x20, 0x22, 0x3f,
+  0x27, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x52,
+  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x53, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x62, 0x6f, 0x6f, 0x6c,
+  0x65, 0x61, 0x6e, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x74, 0x72, 0x75, 0x65,
+  0x22, 0x20, 0x7c, 0x20, 0x22, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x22, 0x29,
+  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x6e,
+  0x75, 0x6d, 0x62, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d, 0x22,
+  0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, 0x5b,
+  0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a, 0x29,
+  0x29, 0x20, 0x28, 0x22, 0x2e, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d,
+  0x2b, 0x29, 0x3f, 0x20, 0x28, 0x5b, 0x65, 0x45, 0x5d, 0x20, 0x5b, 0x2d,
+  0x2b, 0x5d, 0x3f, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2b, 0x29, 0x3f,
+  0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x69,
+  0x6e, 0x74, 0x65, 0x67, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d,
+  0x22, 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20,
+  0x5b, 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a,
+  0x29, 0x29, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20,
+  0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x60, 0x20, 0x22,
+  0x5c, 0x5c, 0x22, 0x22, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x5b, 0x5e, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x5d, 0x20,
+  0x7c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x22, 0x5c,
+  0x5c, 0x5c, 0x5c, 0x22, 0x20, 0x28, 0x5b, 0x22, 0x5c, 0x5c, 0x5c, 0x5c,
+  0x2f, 0x62, 0x66, 0x6e, 0x72, 0x74, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x75,
+  0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46,
+  0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2a, 0x20,
+  0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
+  0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x20, 0x27, 0x22,
+  0x6e, 0x75, 0x6c, 0x6c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x2c, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f, 0x52, 0x45, 0x20, 0x3d, 0x20,
+  0x2f, 0x5b, 0x5e, 0x5c, 0x64, 0x41, 0x2d, 0x5a, 0x61, 0x2d, 0x7a, 0x2d,
+  0x5d, 0x2b, 0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45,
+  0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52,
+  0x45, 0x20, 0x3d, 0x20, 0x2f, 0x5b, 0x5c, 0x6e, 0x5c, 0x72, 0x22, 0x5d,
+  0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52,
+  0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41,
+  0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x53, 0x20, 0x3d, 0x20,
+  0x7b, 0x27, 0x5c, 0x72, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x72, 0x27,
+  0x2c, 0x20, 0x27, 0x5c, 0x6e, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x6e,
+  0x27, 0x2c, 0x20, 0x27, 0x22, 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x22,
+  0x27, 0x7d, 0x3b, 0x0a, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20,
+  0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
+  0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f,
+  0x72, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d,
+  0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x7c,
+  0x7c, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68,
+  0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x20, 0x3d, 0x20,
+  0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c,
+  0x65, 0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x27, 0x73, 0x70, 0x61, 0x63,
+  0x65, 0x27, 0x2c, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f, 0x52, 0x55,
+  0x4c, 0x45, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20,
+  0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x4a, 0x53,
+  0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79,
+  0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x2e, 0x72, 0x65,
+  0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54,
+  0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f,
+  0x52, 0x45, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x20,
+  0x3d, 0x3e, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c,
+  0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50,
+  0x45, 0x53, 0x5b, 0x6d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x60, 0x22, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x7d,
+  0x22, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f,
+  0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65,
+  0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d,
+  0x65, 0x20, 0x3d, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2e, 0x72, 0x65, 0x70,
+  0x6c, 0x61, 0x63, 0x65, 0x28, 0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44,
+  0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f,
+  0x52, 0x45, 0x2c, 0x20, 0x27, 0x2d, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
+  0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x65, 0x73,
+  0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
+  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x28,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20,
+  0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73,
+  0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x2e, 0x68, 0x61, 0x73, 0x28,
+  0x60, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x69, 0x7d, 0x60, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x60, 0x24, 0x7b,
+  0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x69, 0x7d,
+  0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65,
+  0x73, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x6b, 0x65, 0x79, 0x2c, 0x20, 0x72,
+  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20,
+  0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20,
+  0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x74, 0x79, 0x70,
+  0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20,
+  0x6e, 0x61, 0x6d, 0x65, 0x20, 0x7c, 0x7c, 0x20, 0x27, 0x72, 0x6f, 0x6f,
+  0x74, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20,
+  0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x6f, 0x6e, 0x65, 0x4f,
+  0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
+  0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c,
+  0x65, 0x20, 0x3d, 0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e,
+  0x6f, 0x6e, 0x65, 0x4f, 0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68,
+  0x65, 0x6d, 0x61, 0x2e, 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x2e, 0x6d,
+  0x61, 0x70, 0x28, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69,
+  0x73, 0x69, 0x74, 0x28, 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
+  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x69, 0x7d, 0x60, 0x29, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e,
+  0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74,
+  0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65,
+  0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72,
+  0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20,
+  0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c,
+  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x63, 0x6f,
+  0x6e, 0x73, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x65,
+  0x6e, 0x75, 0x6d, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
+  0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x65, 0x6e, 0x75, 0x6d,
+  0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x68,
+  0x69, 0x73, 0x2e, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69,
+  0x74, 0x65, 0x72, 0x61, 0x6c, 0x28, 0x76, 0x29, 0x29, 0x2e, 0x6a, 0x6f,
+  0x69, 0x6e, 0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c,
+  0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20,
+  0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d,
+  0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x63,
+  0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x20, 0x26, 0x26,
+  0x20, 0x27, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73,
+  0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20,
+  0x54, 0x4f, 0x44, 0x4f, 0x3a, 0x20, 0x60, 0x72, 0x65, 0x71, 0x75, 0x69,
+  0x72, 0x65, 0x64, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
+  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
+  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72,
+  0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f,
+  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70,
+  0x72, 0x6f, 0x70, 0x50, 0x61, 0x69, 0x72, 0x73, 0x20, 0x3d, 0x20, 0x4f,
+  0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65,
+  0x73, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x70, 0x72, 0x6f,
+  0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x29, 0x2e, 0x73, 0x6f, 0x72,
+  0x74, 0x28, 0x28, 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x3d, 0x3e, 0x20,
+  0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f,
+  0x20, 0x73, 0x6f, 0x72, 0x74, 0x20, 0x62, 0x79, 0x20, 0x70, 0x6f, 0x73,
+  0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f,
+  0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x28, 0x69, 0x66, 0x20,
+  0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, 0x65, 0x64, 0x29, 0x20, 0x74,
+  0x68, 0x65, 0x6e, 0x20, 0x62, 0x79, 0x20, 0x6b, 0x65, 0x79, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x74, 0x79,
+  0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
+  0x65, 0x72, 0x5b, 0x61, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3d, 0x3d, 0x3d,
+  0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x27, 0x20, 0x3f, 0x20,
+  0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x61, 0x5b,
+  0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69,
+  0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x42,
+  0x20, 0x3d, 0x20, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x70, 0x72,
+  0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d,
+  0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6e, 0x75, 0x6d, 0x62, 0x65,
+  0x72, 0x27, 0x20, 0x3f, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64,
+  0x65, 0x72, 0x5b, 0x62, 0x5b, 0x30, 0x5d, 0x5d, 0x20, 0x3a, 0x20, 0x49,
+  0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20,
+  0x6f, 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x2d, 0x20, 0x6f, 0x72, 0x64,
+  0x65, 0x72, 0x42, 0x20, 0x7c, 0x7c, 0x20, 0x61, 0x5b, 0x30, 0x5d, 0x2e,
+  0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x61, 0x72,
+  0x65, 0x28, 0x62, 0x5b, 0x30, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d,
+  0x20, 0x27, 0x22, 0x7b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x70,
+  0x50, 0x61, 0x69, 0x72, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63,
+  0x68, 0x28, 0x28, 0x5b, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65,
+  0x2c, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61,
+  0x5d, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74,
+  0x20, 0x70, 0x72, 0x6f, 0x70, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
+  0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, 0x73,
+  0x69, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20,
+  0x3a, 0x20, 0x22, 0x22, 0x7d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x4e,
+  0x61, 0x6d, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3e, 0x20,
+  0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27,
+  0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20,
+  0x2b, 0x3d, 0x20, 0x60, 0x20, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e,
+  0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72,
+  0x61, 0x6c, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x29,
+  0x7d, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x22, 0x3a, 0x22, 0x20,
+  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70,
+  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x60, 0x3b, 0x0a,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20,
+  0x27, 0x20, 0x22, 0x7d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27,
+  0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74,
+  0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64,
+  0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61,
+  0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66,
+  0x20, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
+  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27,
+  0x20, 0x26, 0x26, 0x20, 0x27, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x27, 0x20,
+  0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x4f,
+  0x44, 0x4f, 0x20, 0x60, 0x70, 0x72, 0x65, 0x66, 0x69, 0x78, 0x49, 0x74,
+  0x65, 0x6d, 0x73, 0x60, 0x20, 0x6b, 0x65, 0x79, 0x77, 0x6f, 0x72, 0x64,
+  0x20, 0x28, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x70, 0x79, 0x74, 0x68, 0x6f,
+  0x6e, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61,
+  0x74, 0x69, 0x6f, 0x6e, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75,
+  0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69,
+  0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x2e, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x2c, 0x20, 0x60, 0x24,
+  0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65,
+  0x20, 0x3f, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x3a, 0x20, 0x22, 0x22, 0x7d,
+  0x69, 0x74, 0x65, 0x6d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65,
+  0x20, 0x3d, 0x20, 0x60, 0x22, 0x5b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63,
+  0x65, 0x20, 0x28, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, 0x6c,
+  0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x28, 0x22, 0x2c, 0x22, 0x20,
+  0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d,
+  0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x29, 0x2a, 0x29,
+  0x3f, 0x20, 0x22, 0x5d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60,
+  0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75,
+  0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64,
+  0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d,
+  0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x50, 0x52,
+  0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45,
+  0x53, 0x5b, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65,
+  0x5d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45,
+  0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, 0x72, 0x65, 0x63, 0x6f,
+  0x67, 0x6e, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d,
+  0x61, 0x3a, 0x20, 0x24, 0x7b, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74,
+  0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x73, 0x63, 0x68, 0x65,
+  0x6d, 0x61, 0x29, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65,
+  0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61,
+  0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65,
+  0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20,
+  0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, 0x3a, 0x20, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x2c, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49,
+  0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x73,
+  0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x5d, 0x0a, 0x20,
+  0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20,
+  0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72,
+  0x6d, 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, 0x29,
+  0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x67,
+  0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72,
+  0x75, 0x6c, 0x65, 0x73, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68,
+  0x28, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65,
+  0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20,
+  0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x2b, 0x3d, 0x20,
+  0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x3a, 0x3a, 0x3d,
+  0x20, 0x24, 0x7b, 0x72, 0x75, 0x6c, 0x65, 0x7d, 0x5c, 0x6e, 0x60, 0x3b,
+  0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20,
+  0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x67, 0x72, 0x61, 0x6d,
+  0x6d, 0x61, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a
+};
+unsigned int json_schema_to_grammar_mjs_len = 3695;
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -141,6 +141,7 @@
    } from '/index.js';

    import { llama } from '/completion.js';
+    import { SchemaConverter } from '/json-schema-to-grammar.mjs';

    const session = signal({
      prompt: "This is a conversation between user and llama, a friendly chatbot. respond in simple markdown.",
@ -166,6 +167,7 @@
      mirostat: 0, // 0/1/2
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
+      grammar: null,
    })

    const llamaStats = signal(null)
@ -304,6 +306,26 @@
      const updateParamsFloat = (el) => params.value = { ...params.value, [el.target.name]: parseFloat(el.target.value) }
      const updateParamsInt = (el) => params.value = { ...params.value, [el.target.name]: Math.floor(parseFloat(el.target.value)) }

+      const grammarJsonSchemaPropOrder = signal('')
+      const updateGrammarJsonSchemaPropOrder = (el) => grammarJsonSchemaPropOrder.value = el.target.value
+      const convertJSONSchemaGrammar = () => {
+        try {
+          const schema = JSON.parse(params.value.grammar)
+          const converter = new SchemaConverter(
+            grammarJsonSchemaPropOrder.value
+              .split(',')
+              .reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
+          )
+          converter.visit(schema, '')
+          params.value = {
+            ...params.value,
+            grammar: converter.formatGrammar(),
+          }
+        } catch (e) {
+          alert(`Convert failed: ${e.message}`)
+        }
+      }
+
      const FloatField = ({label, max, min, name, step, value}) => {
        return html`
          <div>
@ -355,6 +377,13 @@
              <label for="template">Chat history template</label>
              <textarea id="template" name="historyTemplate" value="${session.value.historyTemplate}" rows=1 oninput=${updateSession}/>
            </div>
+
+            <div>
+              <label for="grammar">Grammar</label>
+              <textarea id="grammar" name="grammar" placeholder="Use gbnf or JSON Schema+convert" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
+              <input type="text" name="prop-order" placeholder="order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
+              <button type="button" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
+            </div>
          </fieldset>

          <fieldset class="two">
--- a/examples/server/public/index.js
+++ b/examples/server/public/index.js
--- a/examples/server/public/json-schema-to-grammar.mjs
+++ b/examples/server/public/json-schema-to-grammar.mjs
@ -0,0 +1,112 @@
+// Body of the `space` rule: a single optional space character.
+const SPACE_RULE = '" "?';
+
+// Grammar rule bodies for the JSON Schema primitive types, keyed by the value of
+// the schema's `type` field. Every body ends with a reference to the `space` rule.
+const PRIMITIVE_RULES = {
+  boolean: '("true" | "false") space',
+  number: '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
+  integer: '("-"? ([0-9] | [1-9] [0-9]*)) space',
+  string: ` "\\"" (
+        [^"\\\\] |
+        "\\\\" (["\\\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+      )* "\\"" space`,
+  null: '"null" space',
+};
+
+const INVALID_RULE_CHARS_RE = /[^\dA-Za-z-]+/g;
+const GRAMMAR_LITERAL_ESCAPE_RE = /[\n\r"]/g;
+const GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"'};
+
+export class SchemaConverter {
+  constructor(propOrder) {
+    this._propOrder = propOrder || {};
+    this._rules = new Map();
+    this._rules.set('space', SPACE_RULE);
+  }
+
+  _formatLiteral(literal) {
+    const escaped = JSON.stringify(literal).replace(
+      GRAMMAR_LITERAL_ESCAPE_RE,
+      m => GRAMMAR_LITERAL_ESCAPES[m]
+    );
+    return `"${escaped}"`;
+  }
+
+  _addRule(name, rule) {
+    let escName = name.replace(INVALID_RULE_CHARS_RE, '-');
+    let key = escName;
+
+    if (this._rules.has(escName)) {
+      if (this._rules.get(escName) === rule) {
+        return key;
+      }
+
+      let i = 0;
+      while (this._rules.has(`${escName}${i}`)) {
+        i += 1;
+      }
+      key = `${escName}${i}`;
+    }
+
+    this._rules.set(key, rule);
+    return key;
+  }
+
+  visit(schema, name) {
+    const schemaType = schema.type;
+    const ruleName = name || 'root';
+
+    if (schema.oneOf || schema.anyOf) {
+      const rule = (schema.oneOf || schema.anyOf).map((altSchema, i) =>
+        this.visit(altSchema, `${name}${name ? "-" : ""}${i}`)
+      ).join(' | ');
+
+      return this._addRule(ruleName, rule);
+    } else if ('const' in schema) {
+      return this._addRule(ruleName, this._formatLiteral(schema.const));
+    } else if ('enum' in schema) {
+      const rule = schema.enum.map(v => this._formatLiteral(v)).join(' | ');
+      return this._addRule(ruleName, rule);
+    } else if (schemaType === 'object' && 'properties' in schema) {
+      // TODO: `required` keyword (from python implementation)
+      const propOrder = this._propOrder;
+      const propPairs = Object.entries(schema.properties).sort((a, b) => {
+        // sort by position in prop_order (if specified) then by key
+        const orderA = typeof propOrder[a[0]] === 'number' ? propOrder[a[0]] : Infinity;
+        const orderB = typeof propOrder[b[0]] === 'number' ? propOrder[b[0]] : Infinity;
+        return orderA - orderB || a[0].localeCompare(b[0]);
+      });
+
+      let rule = '"{" space';
+      propPairs.forEach(([propName, propSchema], i) => {
+        const propRuleName = this.visit(propSchema, `${name}${name ? "-" : ""}${propName}`);
+        if (i > 0) {
+          rule += ' "," space';
+        }
+        rule += ` ${this._formatLiteral(propName)} space ":" space ${propRuleName}`;
+      });
+      rule += ' "}" space';
+
+      return this._addRule(ruleName, rule);
+    } else if (schemaType === 'array' && 'items' in schema) {
+      // TODO `prefixItems` keyword (from python implementation)
+      const itemRuleName = this.visit(schema.items, `${name}${name ? "-" : ""}item`);
+      const rule = `"[" space (${itemRuleName} ("," space ${itemRuleName})*)? "]" space`;
+      return this._addRule(ruleName, rule);
+    } else {
+      if (!PRIMITIVE_RULES[schemaType]) {
+        throw new Error(`Unrecognized schema: ${JSON.stringify(schema)}`);
+      }
+      return this._addRule(
+        ruleName === 'root' ? 'root' : schemaType,
+        PRIMITIVE_RULES[schemaType]
+      );
+    }
+  }
+
+  formatGrammar() {
+    let grammar = '';
+    this._rules.forEach((rule, name) => {
+      grammar += `${name} ::= ${rule}\n`;
+    });
+    return grammar;
+  }
+}
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -196,6 +196,7 @@ struct llama_server_context
    llama_context *ctx = nullptr;
    gpt_params params;

+    grammar_parser::parse_state parsed_grammar;
    llama_grammar *grammar = nullptr;

    bool truncated = false;
@ -241,10 +242,13 @@ struct llama_server_context
        stopped_limit = false;
        stopping_word = "";
        multibyte_pending = 0;
-        grammar = nullptr;
-
        n_remain = 0;
        n_past = 0;
+
+        if (grammar != nullptr) {
+            llama_grammar_free(grammar);
+            grammar = nullptr;
+        }
    }

    bool loadModel(const gpt_params &params_)
@ -265,8 +269,6 @@ struct llama_server_context
    bool loadGrammar()
    {
        if (!params.grammar.empty()) {
-            grammar_parser::parse_state parsed_grammar;
-
            parsed_grammar = grammar_parser::parse(params.grammar.c_str());
            // will be empty (default) if there are parse errors
            if (parsed_grammar.rules.empty()) {
@ -1006,7 +1008,7 @@ static json format_timings(llama_server_context &llama)
    assert(timings.n_eval == llama.num_tokens_predicted);

    return json{
-        {"prompt_n", timings.n_eval},
+        {"prompt_n", timings.n_p_eval},
        {"prompt_ms", timings.t_p_eval_ms},
        {"prompt_per_token_ms", timings.t_p_eval_ms / timings.n_p_eval},
        {"prompt_per_second", 1e3 / timings.t_p_eval_ms * timings.n_p_eval},
@ -1035,7 +1037,6 @@ static json format_final_response(llama_server_context &llama, const std::string
        {"stopped_limit", llama.stopped_limit},
        {"stopping_word", llama.stopping_word},
        {"tokens_cached", llama.n_past},
-        {"tokens_predicted", llama.num_tokens_predicted},
        {"timings", format_timings(llama)},
    };

--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/llama-util.h
+++ b/llama-util.h
@ -271,20 +271,29 @@ struct llama_mmap {
            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
        }

-        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
        if (prefetch) {
-            // Advise the kernel to preload the mapped memory
+            // The PrefetchVirtualMemory API is only present on Windows 8 and above, so we
+            // will dynamically load it using GetProcAddress.
+            BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
+            HMODULE hKernel32;
+
+            // This call is guaranteed to succeed.
+            hKernel32 = GetModuleHandleW(L"kernel32.dll");
+
+            // This call may fail if on a pre-Win8 system.
+            pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
+
+            if (pPrefetchVirtualMemory) {
+                // Advise the kernel to preload the mapped memory.
                WIN32_MEMORY_RANGE_ENTRY range;
                range.VirtualAddress = addr;
                range.NumberOfBytes = (SIZE_T)size;
-            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
                    fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                            llama_format_win_err(GetLastError()).c_str());
                }
            }
-        #else
-        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
-        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        }
    }

    ~llama_mmap() {
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -11,5 +11,6 @@ llama_add_test(test-quantize-fns.cpp)
 llama_add_test(test-quantize-perf.cpp)
 llama_add_test(test-sampling.cpp)
 llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)
+llama_add_test(test-grammar-parser.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../examples/grammar-parser.cpp)
 llama_add_test(test-grad0.cpp) # SLOW
 # llama_add_test(test-opt.cpp) # SLOW
--- a/tests/test-grammar-parser.cpp
+++ b/tests/test-grammar-parser.cpp
@ -0,0 +1,249 @@
+#ifdef NDEBUG
+#undef NDEBUG
+#endif
+
+#include "llama.h"
+#include "examples/grammar-parser.cpp"
+#include <cassert>
+
+// Checks that the parser's symbol table equals `expected`, entry by entry, in the
+// iteration order of `symbol_ids`. The first mismatch is printed to stderr before
+// the assertion fires. Too many or too few symbols also fail the test.
+static void verify_symbol_ids(
+    const grammar_parser::parse_state &parsed,
+    const std::vector<std::pair<std::string, uint32_t>> &expected)
+{
+    size_t index = 0;
+    for (const auto &entry : parsed.symbol_ids)
+    {
+        const std::string &key = entry.first;
+        const uint32_t value = entry.second;
+
+        // an unexpected extra symbol must fail the test, not read past the table
+        if (index >= expected.size())
+        {
+            fprintf(stderr, "unexpected extra symbol: %s, %u\n", key.c_str(), static_cast<unsigned>(value));
+        }
+        assert(index < expected.size());
+
+        const std::pair<std::string, uint32_t> &expected_pair = expected[index];
+
+        // pretty print error message before asserting
+        if (expected_pair.first != key || expected_pair.second != value)
+        {
+            fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), static_cast<unsigned>(expected_pair.second));
+            fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), static_cast<unsigned>(value));
+            fprintf(stderr, "expected_pair != actual_pair\n");
+        }
+
+        assert(expected_pair.first == key && expected_pair.second == value);
+
+        index++;
+    }
+
+    // every expected symbol must have been produced by the parser
+    assert(index == expected.size());
+}
+
+// Checks that the elements of all parsed rules, concatenated in rule order, equal
+// `expected`. The first mismatch is printed to stderr before the assertion fires.
+// Too many or too few elements also fail the test.
+static void verify_rules(
+    const grammar_parser::parse_state &parsed,
+    const std::vector<llama_grammar_element> &expected)
+{
+    size_t index = 0;
+    for (const auto &rule : parsed.rules)
+    {
+        // compare rule to expected rule
+        for (size_t i = 0; i < rule.size(); i++)
+        {
+            const llama_grammar_element &element = rule[i];
+
+            // an unexpected extra element must fail the test, not read past the table
+            if (index >= expected.size())
+            {
+                fprintf(stderr, "unexpected extra element at index: %zu\n", index);
+            }
+            assert(index < expected.size());
+
+            const llama_grammar_element &expected_element = expected[index];
+
+            // pretty print error message before asserting
+            if (expected_element.type != element.type || expected_element.value != element.value)
+            {
+                fprintf(stderr, "index: %zu\n", index);
+                fprintf(stderr, "expected_element: %d, %u\n", static_cast<int>(expected_element.type), static_cast<unsigned>(expected_element.value));
+                fprintf(stderr, "actual_element: %d, %u\n", static_cast<int>(element.type), static_cast<unsigned>(element.value));
+                fprintf(stderr, "expected_element != actual_element\n");
+            }
+
+            assert(expected_element.type == element.type && expected_element.value == element.value);
+            index++;
+        }
+    }
+
+    // every expected element must have been produced by the parser
+    assert(index == expected.size());
+}
+
+// Parses two grammars and compares the resulting symbol tables and rule elements
+// against hand-written expectations. Returns 0 on success; any mismatch aborts
+// through assert().
+int main()
+{
+    grammar_parser::parse_state parsed_grammar;
+
+    const char *grammar_bytes = R"""(root  ::= (expr "=" term "\n")+
+expr  ::= term ([-+*/] term)*
+term  ::= [0-9]+)""";
+
+    parsed_grammar = grammar_parser::parse(grammar_bytes);
+
+    std::vector<std::pair<std::string, uint32_t>> expected = {
+        {"expr", 2},
+        {"expr_5", 5},
+        {"expr_6", 6},
+        {"root", 0},
+        {"root_1", 1},
+        {"root_4", 4},
+        {"term", 3},
+        {"term_7", 7},
+    };
+
+    verify_symbol_ids(parsed_grammar, expected);
+
+    std::vector<llama_grammar_element> expected_rules = {
+        {LLAMA_GRETYPE_RULE_REF, 4},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 2},
+        {LLAMA_GRETYPE_CHAR, 61},
+        {LLAMA_GRETYPE_RULE_REF, 3},
+        {LLAMA_GRETYPE_CHAR, 10},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 3},
+        {LLAMA_GRETYPE_RULE_REF, 6},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 7},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 1},
+        {LLAMA_GRETYPE_RULE_REF, 4},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_RULE_REF, 1},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_CHAR, 45},
+        {LLAMA_GRETYPE_CHAR_ALT, 43},
+        {LLAMA_GRETYPE_CHAR_ALT, 42},
+        {LLAMA_GRETYPE_CHAR_ALT, 47},
+        {LLAMA_GRETYPE_RULE_REF, 3},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 5},
+        {LLAMA_GRETYPE_RULE_REF, 6},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_CHAR, 48},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+        {LLAMA_GRETYPE_RULE_REF, 7},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_CHAR, 48},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+        {LLAMA_GRETYPE_END, 0},
+    };
+
+    verify_rules(parsed_grammar, expected_rules);
+
+    const char *longer_grammar_bytes = R"""(
+    root  ::= (expr "=" ws term "\n")+
+    expr  ::= term ([-+*/] term)*
+    term  ::= ident | num | "(" ws expr ")" ws
+    ident ::= [a-z] [a-z0-9_]* ws
+    num   ::= [0-9]+ ws
+    ws    ::= [ \t\n]*
+    )""";
+
+    parsed_grammar = grammar_parser::parse(longer_grammar_bytes);
+
+    expected = {
+        {"expr", 2},
+        {"expr_6", 6},
+        {"expr_7", 7},
+        {"ident", 8},
+        {"ident_10", 10},
+        {"num", 9},
+        {"num_11", 11},
+        {"root", 0},
+        {"root_1", 1},
+        {"root_5", 5},
+        {"term", 4},
+        {"ws", 3},
+        {"ws_12", 12},
+    };
+
+    verify_symbol_ids(parsed_grammar, expected);
+
+    expected_rules = {
+        {LLAMA_GRETYPE_RULE_REF, 5},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 2},
+        {LLAMA_GRETYPE_CHAR, 61},
+        {LLAMA_GRETYPE_RULE_REF, 3},
+        {LLAMA_GRETYPE_RULE_REF, 4},
+        {LLAMA_GRETYPE_CHAR, 10},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 4},
+        {LLAMA_GRETYPE_RULE_REF, 7},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 12},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 8},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_RULE_REF, 9},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_CHAR, 40},
+        {LLAMA_GRETYPE_RULE_REF, 3},
+        {LLAMA_GRETYPE_RULE_REF, 2},
+        {LLAMA_GRETYPE_CHAR, 41},
+        {LLAMA_GRETYPE_RULE_REF, 3},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 1},
+        {LLAMA_GRETYPE_RULE_REF, 5},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_RULE_REF, 1},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_CHAR, 45},
+        {LLAMA_GRETYPE_CHAR_ALT, 43},
+        {LLAMA_GRETYPE_CHAR_ALT, 42},
+        {LLAMA_GRETYPE_CHAR_ALT, 47},
+        {LLAMA_GRETYPE_RULE_REF, 4},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 6},
+        {LLAMA_GRETYPE_RULE_REF, 7},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_CHAR, 97},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
+        {LLAMA_GRETYPE_RULE_REF, 10},
+        {LLAMA_GRETYPE_RULE_REF, 3},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_RULE_REF, 11},
+        {LLAMA_GRETYPE_RULE_REF, 3},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_CHAR, 97},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
+        {LLAMA_GRETYPE_CHAR_ALT, 48},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+        {LLAMA_GRETYPE_CHAR_ALT, 95},
+        {LLAMA_GRETYPE_RULE_REF, 10},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_CHAR, 48},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+        {LLAMA_GRETYPE_RULE_REF, 11},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_CHAR, 48},
+        {LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
+        {LLAMA_GRETYPE_END, 0},
+        {LLAMA_GRETYPE_CHAR, 32},
+        {LLAMA_GRETYPE_CHAR_ALT, 9},
+        {LLAMA_GRETYPE_CHAR_ALT, 10},
+        {LLAMA_GRETYPE_RULE_REF, 12},
+        {LLAMA_GRETYPE_ALT, 0},
+        {LLAMA_GRETYPE_END, 0},
+    };
+
+    verify_rules(parsed_grammar, expected_rules);
+
+    return 0;
+}