Merge branch 'master' into compilade/refactor-kv-cache

Francis Couture-Harpin 2024-06-30 15:31:25 -04:00
commit 10c3c419e9
518 changed files with 78202 additions and 66427 deletions


@ -1,7 +1,14 @@
set(TARGET server)
set(TARGET llama-server)
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
if (MINGW)
# fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
endif()
set(TARGET_SRCS
server.cpp
utils.hpp
@ -24,6 +31,7 @@ set(PUBLIC_ASSETS
prompt-formats.js
json-schema-to-grammar.mjs
)
foreach(asset ${PUBLIC_ASSETS})
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
@ -34,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS})
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
)
endforeach()
add_executable(${TARGET} ${TARGET_SRCS})
install(TARGETS ${TARGET} RUNTIME)
target_compile_definitions(${TARGET} PRIVATE
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
)
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
if (LLAMA_SERVER_SSL)
find_package(OpenSSL REQUIRED)
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
endif()
if (WIN32)
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
endif()
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
**If compiled with `LLAMA_SERVER_SSL=ON`**
- `--ssl-key-file FNAME`: path to a PEM-encoded SSL private key file
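The `--spm-infill` flag listed above only changes how the server orders the prefix/suffix pieces internally; the shape of an infill request stays the same. A minimal sketch of such a request (the default address and the `input_prefix`/`input_suffix` field names of the `/infill` route are assumptions, adjust to your setup; run from an ES module or the browser console):

```javascript
// Rough sketch: ask a locally running llama-server to fill in the middle of a function.
// Works the same whether or not the server was started with --spm-infill.
const res = await fetch('http://127.0.0.1:8080/infill', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    input_prefix: 'def add(a, b):\n    ',   // text before the hole
    input_suffix: '\n    return result\n',  // text after the hole
    n_predict: 32,                          // cap on generated tokens
  }),
});
console.log((await res.json()).content);    // generated middle part
```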
@ -80,26 +81,26 @@ The project is under active development, and we are [looking for feedback and co
## Build
`server` is built alongside everything else from the root of the project
`llama-server` is built alongside everything else from the root of the project
- Using `make`:
```bash
make server
make llama-server
```
- Using `CMake`:
```bash
cmake -B build
cmake --build build --config Release -t server
cmake --build build --config Release -t llama-server
```
Binary is at `./build/bin/server`
Binary is at `./build/bin/llama-server`
## Build with SSL
`server` can also be built with SSL support using OpenSSL 3
`llama-server` can also be built with SSL support using OpenSSL 3
- Using `make`:
@ -107,14 +108,14 @@ The project is under active development, and we are [looking for feedback and co
# NOTE: For non-system openssl, use the following:
# CXXFLAGS="-I /path/to/openssl/include"
# LDFLAGS="-L /path/to/openssl/lib"
make LLAMA_SERVER_SSL=true server
make LLAMA_SERVER_SSL=true llama-server
```
- Using `CMake`:
```bash
cmake -B build -DLLAMA_SERVER_SSL=ON
cmake --build build --config Release -t server
cmake --build build --config Release -t llama-server
```
## Quick Start
@ -124,13 +125,13 @@ To get started right away, run the following command, making sure to use the cor
### Unix-based systems (Linux, macOS, etc.)
```bash
./server -m models/7B/ggml-model.gguf -c 2048
./llama-server -m models/7B/ggml-model.gguf -c 2048
```
### Windows
```powershell
server.exe -m models\7B\ggml-model.gguf -c 2048
llama-server.exe -m models\7B\ggml-model.gguf -c 2048
```
The above command will start a server that by default listens on `127.0.0.1:8080`.
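Once it is up, a quick way to check that the server is responding is to hit the completion endpoint; a minimal JavaScript sketch (default address assumed, run from an ES module or the browser console):

```javascript
// Rough sketch: one completion request against a freshly started llama-server.
const res = await fetch('http://127.0.0.1:8080/completion', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    prompt: 'Building a website can be done in 10 simple steps:',
    n_predict: 32,
  }),
});
const data = await res.json();
console.log(data.content); // generated text
```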
@ -629,11 +630,11 @@ bash chat.sh
### OAI-like API
The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
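For example, a chat request against the OAI-compatible route could look like this (a sketch; the `model` value is essentially a placeholder, llama-server answers with the model it was started with):

```javascript
// Rough sketch: OAI-style chat completion against llama-server.
const res = await fetch('http://127.0.0.1:8080/v1/chat/completions', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    model: 'gpt-3.5-turbo', // placeholder, the loaded model is used regardless
    messages: [
      { role: 'system', content: 'You are a helpful assistant.' },
      { role: 'user', content: 'Hello' },
    ],
  }),
});
const data = await res.json();
console.log(data.choices[0].message.content);
```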
### API errors
`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
Example of an error:


@ -99,7 +99,7 @@ The `bench.py` script does several steps:
It aims to be used in the CI, but you can run it manually:
```shell
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
--runner-label local \
--name local \
--branch `git rev-parse --abbrev-ref HEAD` \


@ -245,7 +245,7 @@ def start_server(args):
def start_server_background(args):
# Start the server
server_path = '../../../build/bin/server'
server_path = '../../../build/bin/llama-server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_args = [


@ -634,12 +634,12 @@ return html`
<div>
<div class="grammar">
<label for="template"></label>
<textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
<textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
</div>
<div class="grammar-columns">
<div class="json-schema-controls">
<input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
<button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
<button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
</div>
</div>
</div>


@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
return minItems === 0 ? `(${result})?` : result;
}
function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) {
const hasMin = minValue !== null;
const hasMax = maxValue !== null;
function digitRange(fromChar, toChar) {
out.push("[");
if (fromChar === toChar) {
out.push(fromChar);
} else {
out.push(fromChar);
out.push("-");
out.push(toChar);
}
out.push("]");
}
function moreDigits(minDigits, maxDigits) {
out.push("[0-9]");
if (minDigits === maxDigits && minDigits === 1) {
return;
}
out.push("{");
out.push(minDigits.toString());
if (maxDigits !== minDigits) {
out.push(",");
if (maxDigits !== Number.MAX_SAFE_INTEGER) {
out.push(maxDigits.toString());
}
}
out.push("}");
}
function uniformRange(fromStr, toStr) {
let i = 0;
while (i < fromStr.length && fromStr[i] === toStr[i]) {
i++;
}
if (i > 0) {
out.push("\"");
out.push(fromStr.slice(0, i));
out.push("\"");
}
if (i < fromStr.length) {
if (i > 0) {
out.push(" ");
}
const subLen = fromStr.length - i - 1;
if (subLen > 0) {
const fromSub = fromStr.slice(i + 1);
const toSub = toStr.slice(i + 1);
const subZeros = "0".repeat(subLen);
const subNines = "9".repeat(subLen);
let toReached = false;
out.push("(");
if (fromSub === subZeros) {
digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1));
out.push(" ");
moreDigits(subLen, subLen);
} else {
out.push("[");
out.push(fromStr[i]);
out.push("] ");
out.push("(");
uniformRange(fromSub, subNines);
out.push(")");
if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) {
out.push(" | ");
if (toSub === subNines) {
digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]);
toReached = true;
} else {
digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1));
}
out.push(" ");
moreDigits(subLen, subLen);
}
}
if (!toReached) {
out.push(" | ");
digitRange(toStr[i], toStr[i]);
out.push(" ");
uniformRange(subZeros, toSub);
}
out.push(")");
} else {
out.push("[");
out.push(fromStr[i]);
out.push("-");
out.push(toStr[i]);
out.push("]");
}
}
}
if (hasMin && hasMax) {
if (minValue < 0 && maxValue < 0) {
out.push("\"-\" (");
_generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
out.push(")");
return;
}
if (minValue < 0) {
out.push("\"-\" (");
_generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
out.push(") | ");
minValue = 0;
}
let minS = minValue.toString();
const maxS = maxValue.toString();
const minDigits = minS.length;
const maxDigits = maxS.length;
for (let digits = minDigits; digits < maxDigits; digits++) {
uniformRange(minS, "9".repeat(digits));
minS = "1" + "0".repeat(digits);
out.push(" | ");
}
uniformRange(minS, maxS);
return;
}
const lessDecimals = Math.max(decimalsLeft - 1, 1);
if (hasMin) {
if (minValue < 0) {
out.push("\"-\" (");
_generateMinMaxInt(null, -minValue, out, decimalsLeft, false);
out.push(") | [0] | [1-9] ");
moreDigits(0, decimalsLeft - 1);
} else if (minValue === 0) {
if (topLevel) {
out.push("[0] | [1-9] ");
moreDigits(0, lessDecimals);
} else {
moreDigits(1, decimalsLeft);
}
} else if (minValue <= 9) {
const c = minValue.toString();
const range_start = topLevel ? '1' : '0';
if (c > range_start) {
digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1));
out.push(" ");
moreDigits(1, lessDecimals);
out.push(" | ");
}
digitRange(c, "9");
out.push(" ");
moreDigits(0, lessDecimals);
} else {
const minS = minValue.toString();
const length = minS.length;
const c = minS[0];
if (c > "1") {
digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1));
out.push(" ");
moreDigits(length, lessDecimals);
out.push(" | ");
}
digitRange(c, c);
out.push(" (");
_generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false);
out.push(")");
if (c < "9") {
out.push(" | ");
digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9");
out.push(" ");
moreDigits(length - 1, lessDecimals);
}
}
return;
}
if (hasMax) {
if (maxValue >= 0) {
if (topLevel) {
out.push("\"-\" [1-9] ");
moreDigits(0, lessDecimals);
out.push(" | ");
}
_generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
} else {
out.push("\"-\" (");
_generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
out.push(")");
}
return;
}
throw new Error("At least one of minValue or maxValue must be set");
}
class BuiltinRule {
constructor(content, deps) {
this.content = content;
@ -64,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
const NON_LITERAL_SET = new Set('|.()[]{}*+?');
const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?');
export class SchemaConverter {
constructor(options) {
@ -337,6 +532,64 @@ export class SchemaConverter {
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
}
_notStrings(strings) {
class TrieNode {
constructor() {
this.children = {};
this.isEndOfString = false;
}
insert(str) {
let node = this;
for (const c of str) {
node = node.children[c] = node.children[c] || new TrieNode();
}
node.isEndOfString = true;
}
}
const trie = new TrieNode();
for (const s of strings) {
trie.insert(s);
}
const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
const out = ['["] ( '];
const visit = (node) => {
const rejects = [];
let first = true;
for (const c of Object.keys(node.children).sort()) {
const child = node.children[c];
rejects.push(c);
if (first) {
first = false;
} else {
out.push(' | ');
}
out.push(`[${c}]`);
if (Object.keys(child.children).length > 0) {
out.push(' (');
visit(child);
out.push(')');
} else if (child.isEndOfString) {
out.push(` ${charRuleName}+`);
}
}
if (Object.keys(node.children).length > 0) {
if (!first) {
out.push(' | ');
}
out.push(`[^"${rejects.join('')}] ${charRuleName}*`);
}
};
visit(trie);
out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`);
return out.join('');
}
_resolveRef(ref) {
let refName = ref.split('/').pop();
if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
@ -363,11 +616,11 @@ export class SchemaConverter {
} else if (schema.oneOf || schema.anyOf) {
return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
} else if (Array.isArray(schemaType)) {
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t}))));
} else if ('const' in schema) {
return this._addRule(ruleName, this._generateConstantRule(schema.const));
return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space');
} else if ('enum' in schema) {
const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
return this._addRule(ruleName, rule);
} else if ((schemaType === undefined || schemaType === 'object') &&
('properties' in schema ||
@ -404,7 +657,7 @@ export class SchemaConverter {
}
}
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
} else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
const items = schema.items ?? schema.prefixItems;
if (Array.isArray(items)) {
@ -435,6 +688,24 @@ export class SchemaConverter {
const minLen = schema.minLength || 0;
const maxLen = schema.maxLength;
return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
} else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
let minValue = null;
let maxValue = null;
if ('minimum' in schema) {
minValue = schema.minimum;
} else if ('exclusiveMinimum' in schema) {
minValue = schema.exclusiveMinimum + 1;
}
if ('maximum' in schema) {
maxValue = schema.maximum;
} else if ('exclusiveMaximum' in schema) {
maxValue = schema.exclusiveMaximum - 1;
}
const out = ["("];
_generateMinMaxInt(minValue, maxValue, out);
out.push(") space");
return this._addRule(ruleName, out.join(''));
} else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
} else {
@ -480,12 +751,19 @@ export class SchemaConverter {
const requiredProps = sortedProps.filter(k => required.has(k));
const optionalProps = sortedProps.filter(k => !required.has(k));
if (typeof additionalProperties === 'object' || additionalProperties === true) {
if (additionalProperties) {
const subName = `${name ?? ''}${name ? '-' : ''}additional`;
const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
const valueRule =
additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
: this._addPrimitive('value', PRIMITIVE_RULES['value']);
const key_rule =
sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string'])
: this._addRule(`${subName}-k`, this._notStrings(sortedProps));
propKvRuleNames['*'] = this._addRule(
`${subName}-kv`,
`${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
`${key_rule} ":" space ${valueRule}`);
optionalProps.push('*');
}
@ -502,15 +780,11 @@ export class SchemaConverter {
const [k, ...rest] = ks;
const kvRuleName = propKvRuleNames[k];
let res;
if (k === '*') {
res = this._addRule(
`${name ?? ''}${name ? '-' : ''}additional-kvs`,
`${kvRuleName} ( "," space ` + kvRuleName + ` )*`
)
} else if (firstIsOptional) {
res = `( "," space ${kvRuleName} )?`;
const commaRef = `( "," space ${kvRuleName} )`;
if (firstIsOptional) {
res = commaRef + (k === '*' ? '*' : '?');
} else {
res = kvRuleName;
res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : '');
}
if (rest.length > 0) {
res += ' ' + this._addRule(
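
As a rough usage sketch of the new integer bounds support added above: an integer property with `minimum`/`maximum` is now turned into a digit-range GBNF rule instead of the generic integer rule (the `SchemaConverter` options object and the `visit`/`formatGrammar` calls below follow the file's exported API, but treat the exact invocation as an assumption):

```javascript
import { SchemaConverter } from './json-schema-to-grammar.mjs';

// Rough sketch: convert a schema with a bounded integer into a GBNF grammar.
const converter = new SchemaConverter({});
converter.visit({
  type: 'object',
  properties: {
    day: { type: 'integer', minimum: 1, maximum: 31 }, // exercised by _generateMinMaxInt
  },
  required: ['day'],
  additionalProperties: false,
}, '');
// The printed grammar constrains "day" to 1-31 rather than any integer.
console.log(converter.formatGrammar());
```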


@ -3,6 +3,13 @@
by Humans for All.
## quickstart
To run from the build dir
bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
Continue reading for the details.
## overview
@ -14,6 +21,8 @@ own system prompts.
This allows seeing the generated text / ai-model response in one shot at the end, after it is fully generated,
or potentially as it is being generated, in a streamed manner from the server/ai-model.
![Chat and Settings screens](./simplechat_screens.webp "Chat and Settings screens")
Auto saves the chat session locally as and when the chat is progressing, and in turn at a later time when you
open SimpleChat, an option is provided to restore the old chat session, if a matching one exists.
@ -44,12 +53,12 @@ http module.
### running using examples/server
bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]
### running using python3's server module
first run examples/server
* bin/server -m path/model.gguf
* ./llama-server -m path/model.gguf
next run this web front end in examples/server/public_simplechat
* cd ../examples/server/public_simplechat
@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t
The histogram/freq based trimming logic is currently tuned for the English language wrt its
is-it-an-alphabetic|numeral-char regex match logic.
chatRequestOptions - maintains the list of options/fields to send along with chat request,
apiRequestOptions - maintains the list of options/fields to send along with api request,
irrespective of whether the /chat/completions or /completions endpoint is used.
If you want to add additional options/fields to send to the server/ai-model, and/or
modify the existing option values or remove them, for now you can update this global var
using the browser's development-tools/console.
For string and numeric fields in chatRequestOptions, including even those added by a user
at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
For string, numeric and boolean fields in apiRequestOptions, including even those added by a
user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
created.
cache_prompt option supported by example/server is allowed to be controlled by the user, so that
any caching supported wrt the system-prompt and chat history, if usable, can get used. When the chat
history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
wrt the same, based on aspects related to the model, positional encoding, attention mechanism et al.
However, the system prompt should ideally get the benefit of caching.
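For example, from the browser's devtools console one could tweak these fields directly (a sketch; gMe is the settings object described above and, depending on how it is exposed, may need to be reached via the document object):

```javascript
// Sketch: adjust the request options SimpleChat sends with every request.
gMe.apiRequestOptions.n_predict = 256;      // cap tokens generated per response
gMe.apiRequestOptions.temperature = 0.4;    // make responses less random
gMe.apiRequestOptions.cache_prompt = true;  // let the server reuse its prompt cache
```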
headers - maintains the list of http headers sent when request is made to the server. By default
Content-Type is set to application/json. Additionally Authorization entry is provided, which can
be set if needed using the settings ui.
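For instance, to send a bearer token with every request (a sketch; the token value is a placeholder):

```javascript
// Sketch: set the Authorization header used for all requests from SimpleChat.
gMe.headers["Authorization"] = "Bearer YOUR_API_KEY";
```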
@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
implications of loading of the ai-model's context window by chat history, wrt chat response to
some extent in a simple crude way. You may also want to control the context size enabled when
the server loads ai-model, on the server end.
By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
the implications of loading of the ai-model's context window by chat history, wrt chat response to
some extent in a simple crude way. You may also want to control the context size enabled when the
server loads ai-model, on the server end.
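Concretely, both knobs can be combined from the devtools console (a sketch, using the field names shown above):

```javascript
// Sketch: keep only the last 4 user messages in context and cap the reply length.
gMe.iRecentUserMsgCnt = 4;
gMe.apiRequestOptions.n_predict = 128;   // used by the /completions endpoint
gMe.apiRequestOptions.max_tokens = 128;  // used by the /chat/completions endpoint
```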
Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
to /completions endpoint handling code on server side.
NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
wrt the set of fields sent to server along with the user query. To check how the model behaves
NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
wrt the set of fields sent to server along with the user query, to check how the model behaves
wrt repeatations in general in the generated text response.
An end-user can change these behaviours by editing gMe from the browser's devel-tool/console or by
using the providing settings ui.
using the provided settings ui (for settings exposed through the ui).
### OpenAi / Equivalent API WebService
@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
* the baseUrl in settings ui
* https://api.openai.com/v1 or similar
* Wrt request body - gMe.chatRequestOptions
* Wrt request body - gMe.apiRequestOptions
* model (settings ui)
* any additional fields if required in future


@ -222,8 +222,8 @@ class SimpleChat {
* @param {Object} obj
*/
request_jsonstr_extend(obj) {
for(let k in gMe.chatRequestOptions) {
obj[k] = gMe.chatRequestOptions[k];
for(let k in gMe.apiRequestOptions) {
obj[k] = gMe.apiRequestOptions[k];
}
if (gMe.bStream) {
obj["stream"] = true;
@ -740,11 +740,12 @@ class Me {
"Authorization": "", // Authorization: Bearer OPENAI_API_KEY
}
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
this.chatRequestOptions = {
this.apiRequestOptions = {
"model": "gpt-3.5-turbo",
"temperature": 0.7,
"max_tokens": 1024,
"n_predict": 1024,
"cache_prompt": false,
//"frequency_penalty": 1.2,
//"presence_penalty": 1.2,
};
@ -800,51 +801,55 @@ class Me {
ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
}
ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
}
/**
* Auto create ui input elements for fields in ChatRequestOptions
* Auto create ui input elements for fields in apiRequestOptions
* Currently supports text, number and boolean field types.
* @param {HTMLDivElement} elDiv
*/
show_settings_chatrequestoptions(elDiv) {
show_settings_apirequestoptions(elDiv) {
let typeDict = {
"string": "text",
"number": "number",
};
let fs = document.createElement("fieldset");
let legend = document.createElement("legend");
legend.innerText = "ChatRequestOptions";
legend.innerText = "ApiRequestOptions";
fs.appendChild(legend);
elDiv.appendChild(fs);
for(const k in this.chatRequestOptions) {
let val = this.chatRequestOptions[k];
for(const k in this.apiRequestOptions) {
let val = this.apiRequestOptions[k];
let type = typeof(val);
if (!((type == "string") || (type == "number"))) {
continue;
if (((type == "string") || (type == "number"))) {
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
if (type == "number") {
val = Number(val);
}
this.apiRequestOptions[k] = val;
});
fs.appendChild(inp.div);
} else if (type == "boolean") {
let bbtn = ui.el_creatediv_boolbutton(`Set${k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
this.apiRequestOptions[k] = userVal;
});
fs.appendChild(bbtn.div);
}
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
if (type == "number") {
val = Number(val);
}
this.chatRequestOptions[k] = val;
});
fs.appendChild(inp.div);
}
}
@ -870,6 +875,23 @@ class Me {
});
elDiv.appendChild(bb.div);
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
this.bTrimGarbage = val;
});
elDiv.appendChild(bb.div);
this.show_settings_apirequestoptions(elDiv);
let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
this.apiEP = ApiEP.Type[val];
});
elDiv.appendChild(sel.div);
sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
});
elDiv.appendChild(sel.div);
bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
this.bCompletionFreshChatAlways = val;
});
@ -880,23 +902,6 @@ class Me {
});
elDiv.appendChild(bb.div);
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
this.bTrimGarbage = val;
});
elDiv.appendChild(bb.div);
let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
});
elDiv.appendChild(sel.div);
sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
this.apiEP = ApiEP.Type[val];
});
elDiv.appendChild(sel.div);
this.show_settings_chatrequestoptions(elDiv);
}
}

Binary file not shown (new image, 21 KiB)


@ -1594,7 +1594,7 @@ struct server_context {
} else {
std::string prompt;
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
json_value(task.data, "prompt", std::string());
prompt = json_value(task.data, "prompt", std::string());
}
slot = get_available_slot(prompt);
@ -2020,6 +2020,7 @@ struct server_context {
slot.t_start_generation = 0;
if (slot.infill) {
const bool add_bos = llama_should_add_bos_token(model);
bool suff_rm_leading_spc = true;
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
params.input_suffix.erase(0, 1);
@ -2035,11 +2036,21 @@ struct server_context {
}
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
prefix_tokens.push_back(llama_token_middle(model));
prompt_tokens = prefix_tokens;
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
if (add_bos) {
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
const llama_token middle_token = llama_token_middle(model);
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
prompt_tokens = embd_inp;
} else {
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
}
@ -2606,17 +2617,9 @@ int main(int argc, char ** argv) {
// print sample chat example to make it clear which template is used
{
json chat;
chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
chat.push_back({{"role", "user"}, {"content", "Hello"}});
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
chat.push_back({{"role", "user"}, {"content", "How are you?"}});
const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
LOG_INFO("chat template", {
{"chat_example", chat_example},
{"built_in", params.chat_template.empty()},
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
{"built_in", params.chat_template.empty()},
});
}


@ -27,10 +27,8 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.
```shell
cd ../../..
mkdir build
cd build
cmake -DLLAMA_CURL=ON ../
cmake --build . --target server
cmake -B build -DLLAMA_CURL=ON
cmake --build build --target llama-server
```
2. Start the test: `./tests.sh`
@ -40,7 +38,7 @@ It's possible to override some scenario steps values with environment variables:
| variable | description |
|--------------------------|------------------------------------------------------------------------------------------------|
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` |
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |


@ -82,7 +82,7 @@ Feature: llama.cpp server
Examples: Prompts
| response_format | n_predicted | re_content |
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
| {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" |
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
| {"type": "json_object"} | 10 | \{ " Jacky. |
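The same `response_format` payloads can be sent directly to the server; a rough sketch of the first row above as an HTTP request (endpoint and field names assumed from the OAI-compatible API):

```javascript
// Rough sketch: constrain a chat completion to the JSON schema {"const": "42"}.
const res = await fetch('http://127.0.0.1:8080/v1/chat/completions', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify({
    messages: [{ role: 'user', content: 'What is the answer?' }],
    response_format: { type: 'json_object', schema: { const: '42' } },
    max_tokens: 6,
  }),
});
console.log(await res.json());
```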


@ -1272,9 +1272,9 @@ def context_text(context):
def start_server_background(context):
if os.name == 'nt':
context.server_path = '../../../build/bin/Release/server.exe'
context.server_path = '../../../build/bin/Release/llama-server.exe'
else:
context.server_path = '../../../build/bin/server'
context.server_path = '../../../build/bin/llama-server'
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_listen_addr = context.server_fqdn


@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
size_t alloc_size = 0;
// vector holding all allocated string to be passed to llama_chat_apply_template
std::vector<std::string> str(messages.size() * 2);
std::vector<llama_chat_message> chat(messages.size());
std::vector<llama_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i];
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
alloc_size += str[i*2 + 1].length();
chat[i].role = str[i*2 + 0].c_str();
chat[i].content = str[i*2 + 1].c_str();
std::string role = json_value(curr_msg, "role", std::string(""));
std::string content = json_value(curr_msg, "content", std::string(""));
chat.push_back({role, content});
}
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
std::vector<char> buf(alloc_size * 2);
// run the first time to get the total output length
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
// if it turns out that our buffer is too small, we resize it
if ((size_t) res > buf.size()) {
buf.resize(res);
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
}
const std::string formatted_chat(buf.data(), res);
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
return formatted_chat;
}