Merge branch 'master' into compilade/refactor-kv-cache
This commit is contained in:
commit
10c3c419e9
518 changed files with 78202 additions and 66427 deletions
|
@ -1,7 +1,14 @@
|
|||
set(TARGET server)
|
||||
set(TARGET llama-server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
if (MINGW)
|
||||
# fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
|
||||
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
||||
endif()
|
||||
|
||||
set(TARGET_SRCS
|
||||
server.cpp
|
||||
utils.hpp
|
||||
|
@ -24,6 +31,7 @@ set(PUBLIC_ASSETS
|
|||
prompt-formats.js
|
||||
json-schema-to-grammar.mjs
|
||||
)
|
||||
|
||||
foreach(asset ${PUBLIC_ASSETS})
|
||||
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
|
||||
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
|
||||
|
@ -34,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS})
|
|||
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
|
||||
)
|
||||
endforeach()
|
||||
|
||||
add_executable(${TARGET} ${TARGET_SRCS})
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if (LLAMA_SERVER_SSL)
|
||||
find_package(OpenSSL REQUIRED)
|
||||
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
|
||||
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||
endif()
|
||||
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
|
|
|
@ -73,6 +73,7 @@ The project is under active development, and we are [looking for feedback and co
|
|||
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
|
||||
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
|
||||
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
|
||||
- `--spm-infill` : Use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this.
|
||||
|
||||
**If compiled with `LLAMA_SERVER_SSL=ON`**
|
||||
- `--ssl-key-file FNAME`: path to a file containing a PEM-encoded SSL private key
|
||||
|
@ -80,26 +81,26 @@ The project is under active development, and we are [looking for feedback and co
|
|||
|
||||
## Build
|
||||
|
||||
`server` is built alongside everything else from the root of the project
|
||||
`llama-server` is built alongside everything else from the root of the project
|
||||
|
||||
- Using `make`:
|
||||
|
||||
```bash
|
||||
make server
|
||||
make llama-server
|
||||
```
|
||||
|
||||
- Using `CMake`:
|
||||
|
||||
```bash
|
||||
cmake -B build
|
||||
cmake --build build --config Release -t server
|
||||
cmake --build build --config Release -t llama-server
|
||||
```
|
||||
|
||||
Binary is at `./build/bin/server`
|
||||
Binary is at `./build/bin/llama-server`
|
||||
|
||||
## Build with SSL
|
||||
|
||||
`server` can also be built with SSL support using OpenSSL 3
|
||||
`llama-server` can also be built with SSL support using OpenSSL 3
|
||||
|
||||
- Using `make`:
|
||||
|
||||
|
@ -107,14 +108,14 @@ The project is under active development, and we are [looking for feedback and co
|
|||
# NOTE: For non-system openssl, use the following:
|
||||
# CXXFLAGS="-I /path/to/openssl/include"
|
||||
# LDFLAGS="-L /path/to/openssl/lib"
|
||||
make LLAMA_SERVER_SSL=true server
|
||||
make LLAMA_SERVER_SSL=true llama-server
|
||||
```
|
||||
|
||||
- Using `CMake`:
|
||||
|
||||
```bash
|
||||
cmake -B build -DLLAMA_SERVER_SSL=ON
|
||||
cmake --build build --config Release -t server
|
||||
cmake --build build --config Release -t llama-server
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
@ -124,13 +125,13 @@ To get started right away, run the following command, making sure to use the cor
|
|||
### Unix-based systems (Linux, macOS, etc.)
|
||||
|
||||
```bash
|
||||
./server -m models/7B/ggml-model.gguf -c 2048
|
||||
./llama-server -m models/7B/ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
### Windows
|
||||
|
||||
```powershell
|
||||
server.exe -m models\7B\ggml-model.gguf -c 2048
|
||||
llama-server.exe -m models\7B\ggml-model.gguf -c 2048
|
||||
```
|
||||
|
||||
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
||||
|
@ -629,11 +630,11 @@ bash chat.sh
|
|||
|
||||
### OAI-like API
|
||||
|
||||
The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi
|
||||
The HTTP `llama-server` supports an OAI-like API: https://github.com/openai/openai-openapi
|
||||
|
||||
### API errors
|
||||
|
||||
`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
|
||||
`llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi
|
||||
|
||||
Example of an error:
|
||||
|
||||
|
|
|
@ -99,7 +99,7 @@ The `bench.py` script does several steps:
|
|||
It aims to be used in the CI, but you can run it manually:
|
||||
|
||||
```shell
|
||||
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \
|
||||
LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/llama-server python bench.py \
|
||||
--runner-label local \
|
||||
--name local \
|
||||
--branch `git rev-parse --abbrev-ref HEAD` \
|
||||
|
|
|
@ -245,7 +245,7 @@ def start_server(args):
|
|||
|
||||
def start_server_background(args):
|
||||
# Start the server
|
||||
server_path = '../../../build/bin/server'
|
||||
server_path = '../../../build/bin/llama-server'
|
||||
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
|
||||
server_path = os.environ['LLAMA_SERVER_BIN_PATH']
|
||||
server_args = [
|
||||
|
|
|
@ -634,12 +634,12 @@ return html`
|
|||
<div>
|
||||
<div class="grammar">
|
||||
<label for="template"></label>
|
||||
<textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
||||
<textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
||||
</div>
|
||||
<div class="grammar-columns">
|
||||
<div class="json-schema-controls">
|
||||
<input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
||||
<button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
|
||||
<button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
|
|
@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
|
|||
return minItems === 0 ? `(${result})?` : result;
|
||||
}
|
||||
|
||||
// Appends to `out` (an array of string pieces, joined by the caller) a grammar
// expression for integers bounded by `minValue` and/or `maxValue`.
//   minValue / maxValue : bound, or null when that side is unbounded.
//   out                 : array that receives the generated pieces (mutated in place).
//   decimalsLeft        : digit budget used to cap the repetition counts emitted for
//                         an unbounded side (default 16).
//   topLevel            : when false the function is emitting the tail of a number
//                         whose leading digit was already written, so a leading `0`
//                         is allowed and no sign alternative is produced.
// Throws an Error when both bounds are null.
function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) {
|
||||
const hasMin = minValue !== null;
|
||||
const hasMax = maxValue !== null;
|
||||
|
||||
// Emits a character class for a single position: `[c]` when both ends are the
// same character, otherwise `[from-to]`.
function digitRange(fromChar, toChar) {
|
||||
out.push("[");
|
||||
if (fromChar === toChar) {
|
||||
out.push(fromChar);
|
||||
} else {
|
||||
out.push(fromChar);
|
||||
out.push("-");
|
||||
out.push(toChar);
|
||||
}
|
||||
out.push("]");
|
||||
}
|
||||
|
||||
// Emits `[0-9]` followed by a repetition suffix: nothing for exactly one digit,
// `{n}` when min equals max, `{min,max}` otherwise.
function moreDigits(minDigits, maxDigits) {
|
||||
out.push("[0-9]");
|
||||
if (minDigits === maxDigits && minDigits === 1) {
|
||||
return;
|
||||
}
|
||||
out.push("{");
|
||||
out.push(minDigits.toString());
|
||||
if (maxDigits !== minDigits) {
|
||||
out.push(",");
|
||||
// A max of Number.MAX_SAFE_INTEGER leaves the upper count out, giving `{min,}`.
if (maxDigits !== Number.MAX_SAFE_INTEGER) {
|
||||
out.push(maxDigits.toString());
|
||||
}
|
||||
}
|
||||
out.push("}");
|
||||
}
|
||||
|
||||
// Emits the alternatives for the digit strings lying between `fromStr` and `toStr`.
// NOTE(review): `toStr` is indexed with positions derived from `fromStr.length`, so
// both strings are presumably expected to have the same length - confirm with callers.
function uniformRange(fromStr, toStr) {
|
||||
let i = 0;
|
||||
// Skip the shared leading prefix; it is written once as a quoted literal below.
while (i < fromStr.length && fromStr[i] === toStr[i]) {
|
||||
i++;
|
||||
}
|
||||
if (i > 0) {
|
||||
out.push("\"");
|
||||
out.push(fromStr.slice(0, i));
|
||||
out.push("\"");
|
||||
}
|
||||
if (i < fromStr.length) {
|
||||
if (i > 0) {
|
||||
out.push(" ");
|
||||
}
|
||||
// Number of digits remaining after the first differing position `i`.
const subLen = fromStr.length - i - 1;
|
||||
if (subLen > 0) {
|
||||
const fromSub = fromStr.slice(i + 1);
|
||||
const toSub = toStr.slice(i + 1);
|
||||
const subZeros = "0".repeat(subLen);
|
||||
const subNines = "9".repeat(subLen);
|
||||
|
||||
// Set when the middle alternative already covers the `toStr[i]` leading digit,
// so the final "upper edge" alternative must not be emitted again.
let toReached = false;
|
||||
out.push("(");
|
||||
if (fromSub === subZeros) {
|
||||
// Lower edge is round (all zeros after position i): every leading digit
// below `toStr[i]` may be followed by any `subLen` digits.
digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1));
|
||||
out.push(" ");
|
||||
moreDigits(subLen, subLen);
|
||||
} else {
|
||||
// Lower edge: leading digit `fromStr[i]` followed by fromSub..99…9.
out.push("[");
|
||||
out.push(fromStr[i]);
|
||||
out.push("] ");
|
||||
out.push("(");
|
||||
uniformRange(fromSub, subNines);
|
||||
out.push(")");
|
||||
// Middle: leading digits strictly between the two edges, any tail.
if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) {
|
||||
out.push(" | ");
|
||||
if (toSub === subNines) {
|
||||
digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]);
|
||||
toReached = true;
|
||||
} else {
|
||||
digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1));
|
||||
}
|
||||
out.push(" ");
|
||||
moreDigits(subLen, subLen);
|
||||
}
|
||||
}
|
||||
// Upper edge: leading digit `toStr[i]` followed by 00…0..toSub.
if (!toReached) {
|
||||
out.push(" | ");
|
||||
digitRange(toStr[i], toStr[i]);
|
||||
out.push(" ");
|
||||
uniformRange(subZeros, toSub);
|
||||
}
|
||||
out.push(")");
|
||||
} else {
|
||||
// Only the last digit differs: a plain `[a-b]` class is enough.
out.push("[");
|
||||
out.push(fromStr[i]);
|
||||
out.push("-");
|
||||
out.push(toStr[i]);
|
||||
out.push("]");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Case 1: both bounds are given.
if (hasMin && hasMax) {
|
||||
// Entirely negative range: emit a minus sign and the range of the negated bounds.
if (minValue < 0 && maxValue < 0) {
|
||||
out.push("\"-\" (");
|
||||
_generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
|
||||
out.push(")");
|
||||
return;
|
||||
}
|
||||
|
||||
// Range straddles zero: emit the negative part, then continue with [0, maxValue].
// NOTE(review): the negative part is generated from 0, so it presumably also
// admits `-0` - confirm this is intended.
if (minValue < 0) {
|
||||
out.push("\"-\" (");
|
||||
_generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
|
||||
out.push(") | ");
|
||||
minValue = 0;
|
||||
}
|
||||
|
||||
let minS = minValue.toString();
|
||||
const maxS = maxValue.toString();
|
||||
const minDigits = minS.length;
|
||||
const maxDigits = maxS.length;
|
||||
|
||||
// One alternative per digit count below that of maxValue: from the current
// lower edge up to 99…9, then the lower edge moves to the next power of ten.
for (let digits = minDigits; digits < maxDigits; digits++) {
|
||||
uniformRange(minS, "9".repeat(digits));
|
||||
minS = "1" + "0".repeat(digits);
|
||||
out.push(" | ");
|
||||
}
|
||||
uniformRange(minS, maxS);
|
||||
return;
|
||||
}
|
||||
|
||||
// Digit budget handed to trailing repetitions / recursive calls; never below 1.
const lessDecimals = Math.max(decimalsLeft - 1, 1);
|
||||
|
||||
// Case 2: lower bound only.
if (hasMin) {
|
||||
if (minValue < 0) {
|
||||
// Negative numbers down to minValue, or any non-negative number.
out.push("\"-\" (");
|
||||
_generateMinMaxInt(null, -minValue, out, decimalsLeft, false);
|
||||
out.push(") | [0] | [1-9] ");
|
||||
// NOTE(review): uses `decimalsLeft - 1` where the other branches use
// `lessDecimals` - confirm the difference is intentional.
moreDigits(0, decimalsLeft - 1);
|
||||
} else if (minValue === 0) {
|
||||
if (topLevel) {
|
||||
out.push("[0] | [1-9] ");
|
||||
moreDigits(0, lessDecimals);
|
||||
} else {
|
||||
moreDigits(1, decimalsLeft);
|
||||
}
|
||||
} else if (minValue <= 9) {
|
||||
// Single-digit minimum: string comparison of one-character digits below.
const c = minValue.toString();
|
||||
const range_start = topLevel ? '1' : '0';
|
||||
if (c > range_start) {
|
||||
// Leading digit below `c` is acceptable only with at least one more digit.
digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1));
|
||||
out.push(" ");
|
||||
moreDigits(1, lessDecimals);
|
||||
out.push(" | ");
|
||||
}
|
||||
digitRange(c, "9");
|
||||
out.push(" ");
|
||||
moreDigits(0, lessDecimals);
|
||||
} else {
|
||||
// Multi-digit minimum: split on the leading digit `c`.
const minS = minValue.toString();
|
||||
const length = minS.length;
|
||||
const c = minS[0];
|
||||
|
||||
if (c > "1") {
|
||||
// Smaller leading digit: needs at least `length` further digits.
digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1));
|
||||
out.push(" ");
|
||||
moreDigits(length, lessDecimals);
|
||||
out.push(" | ");
|
||||
}
|
||||
// Same leading digit: recurse on the remaining digits as a non-top-level minimum.
digitRange(c, c);
|
||||
out.push(" (");
|
||||
_generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false);
|
||||
out.push(")");
|
||||
if (c < "9") {
|
||||
// Larger leading digit: at least `length - 1` further digits.
out.push(" | ");
|
||||
digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9");
|
||||
out.push(" ");
|
||||
moreDigits(length - 1, lessDecimals);
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Case 3: upper bound only.
if (hasMax) {
|
||||
if (maxValue >= 0) {
|
||||
if (topLevel) {
|
||||
// Any negative number is below a non-negative maximum.
out.push("\"-\" [1-9] ");
|
||||
moreDigits(0, lessDecimals);
|
||||
out.push(" | ");
|
||||
}
|
||||
_generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
|
||||
} else {
|
||||
// Negative maximum: minus sign followed by a magnitude of at least -maxValue.
out.push("\"-\" (");
|
||||
_generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
|
||||
out.push(")");
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
throw new Error("At least one of minValue or maxValue must be set");
|
||||
}
|
||||
|
||||
class BuiltinRule {
|
||||
constructor(content, deps) {
|
||||
this.content = content;
|
||||
|
@ -64,7 +259,7 @@ const GRAMMAR_RANGE_LITERAL_ESCAPE_RE = /[\n\r"\]\-\\]/g;
|
|||
const GRAMMAR_LITERAL_ESCAPES = { '\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']': '\\]' };
|
||||
|
||||
const NON_LITERAL_SET = new Set('|.()[]{}*+?');
|
||||
const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('[]()|{}*+?');
|
||||
const ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = new Set('^$.[]()|{}*+?');
|
||||
|
||||
export class SchemaConverter {
|
||||
constructor(options) {
|
||||
|
@ -337,6 +532,64 @@ export class SchemaConverter {
|
|||
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
|
||||
}
|
||||
|
||||
// Builds and returns the text of a grammar rule for a double-quoted string that is
// meant to differ from every entry of `strings` (the rule is assembled from a trie
// of those entries). The `char` primitive rule is registered as a side effect.
// NOTE(review): characters from `strings` are interpolated into `[...]` classes
// without escaping - entries containing `]`, `\`, `^` or `-` presumably need
// escaping first; verify.
_notStrings(strings) {
|
||||
// Minimal prefix tree keyed by the characters yielded when iterating a string.
class TrieNode {
|
||||
constructor() {
|
||||
this.children = {};
|
||||
this.isEndOfString = false;
|
||||
}
|
||||
|
||||
// Adds `str` to the trie, creating missing nodes along the way.
insert(str) {
|
||||
let node = this;
|
||||
for (const c of str) {
|
||||
node = node.children[c] = node.children[c] || new TrieNode();
|
||||
}
|
||||
node.isEndOfString = true;
|
||||
}
|
||||
}
|
||||
|
||||
const trie = new TrieNode();
|
||||
for (const s of strings) {
|
||||
trie.insert(s);
|
||||
}
|
||||
|
||||
const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
|
||||
// Opening quote, then a group holding the alternatives produced by `visit`.
const out = ['["] ( '];
|
||||
|
||||
// Emits the alternatives for one trie node into `out`.
const visit = (node) => {
|
||||
// Characters that continue some listed string from this node.
const rejects = [];
|
||||
let first = true;
|
||||
// Sorted keys keep the generated rule deterministic.
for (const c of Object.keys(node.children).sort()) {
|
||||
const child = node.children[c];
|
||||
rejects.push(c);
|
||||
if (first) {
|
||||
first = false;
|
||||
} else {
|
||||
out.push(' | ');
|
||||
}
|
||||
out.push(`[${c}]`);
|
||||
if (Object.keys(child.children).length > 0) {
|
||||
// Some listed string continues past this character: recurse.
out.push(' (');
|
||||
visit(child);
|
||||
out.push(')');
|
||||
} else if (child.isEndOfString) {
|
||||
// A listed string ends exactly here: require at least one more character.
out.push(` ${charRuleName}+`);
|
||||
}
|
||||
}
|
||||
if (Object.keys(node.children).length > 0) {
|
||||
if (!first) {
|
||||
out.push(' | ');
|
||||
}
|
||||
// Any character other than `"` and the ones collected above, then any tail.
out.push(`[^"${rejects.join('')}] ${charRuleName}*`);
|
||||
}
|
||||
};
|
||||
|
||||
visit(trie);
|
||||
|
||||
// The group is optional (so the empty string matches) unless the empty string
// is itself one of `strings`.
out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`);
|
||||
return out.join('');
|
||||
}
|
||||
|
||||
_resolveRef(ref) {
|
||||
let refName = ref.split('/').pop();
|
||||
if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
|
||||
|
@ -363,11 +616,11 @@ export class SchemaConverter {
|
|||
} else if (schema.oneOf || schema.anyOf) {
|
||||
return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
|
||||
} else if (Array.isArray(schemaType)) {
|
||||
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
|
||||
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t}))));
|
||||
} else if ('const' in schema) {
|
||||
return this._addRule(ruleName, this._generateConstantRule(schema.const));
|
||||
return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space');
|
||||
} else if ('enum' in schema) {
|
||||
const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
|
||||
const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
|
||||
return this._addRule(ruleName, rule);
|
||||
} else if ((schemaType === undefined || schemaType === 'object') &&
|
||||
('properties' in schema ||
|
||||
|
@ -404,7 +657,7 @@ export class SchemaConverter {
|
|||
}
|
||||
}
|
||||
|
||||
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
|
||||
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
|
||||
} else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
|
||||
const items = schema.items ?? schema.prefixItems;
|
||||
if (Array.isArray(items)) {
|
||||
|
@ -435,6 +688,24 @@ export class SchemaConverter {
|
|||
const minLen = schema.minLength || 0;
|
||||
const maxLen = schema.maxLength;
|
||||
return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
|
||||
} else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
|
||||
let minValue = null;
|
||||
let maxValue = null;
|
||||
if ('minimum' in schema) {
|
||||
minValue = schema.minimum;
|
||||
} else if ('exclusiveMinimum' in schema) {
|
||||
minValue = schema.exclusiveMinimum + 1;
|
||||
}
|
||||
if ('maximum' in schema) {
|
||||
maxValue = schema.maximum;
|
||||
} else if ('exclusiveMaximum' in schema) {
|
||||
maxValue = schema.exclusiveMaximum - 1;
|
||||
}
|
||||
|
||||
const out = ["("];
|
||||
_generateMinMaxInt(minValue, maxValue, out);
|
||||
out.push(") space");
|
||||
return this._addRule(ruleName, out.join(''));
|
||||
} else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
|
||||
return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
|
||||
} else {
|
||||
|
@ -480,12 +751,19 @@ export class SchemaConverter {
|
|||
const requiredProps = sortedProps.filter(k => required.has(k));
|
||||
const optionalProps = sortedProps.filter(k => !required.has(k));
|
||||
|
||||
if (typeof additionalProperties === 'object' || additionalProperties === true) {
|
||||
if (additionalProperties) {
|
||||
const subName = `${name ?? ''}${name ? '-' : ''}additional`;
|
||||
const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
|
||||
const valueRule =
|
||||
additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
|
||||
: this._addPrimitive('value', PRIMITIVE_RULES['value']);
|
||||
|
||||
const key_rule =
|
||||
sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string'])
|
||||
: this._addRule(`${subName}-k`, this._notStrings(sortedProps));
|
||||
|
||||
propKvRuleNames['*'] = this._addRule(
|
||||
`${subName}-kv`,
|
||||
`${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
|
||||
`${key_rule} ":" space ${valueRule}`);
|
||||
optionalProps.push('*');
|
||||
}
|
||||
|
||||
|
@ -502,15 +780,11 @@ export class SchemaConverter {
|
|||
const [k, ...rest] = ks;
|
||||
const kvRuleName = propKvRuleNames[k];
|
||||
let res;
|
||||
if (k === '*') {
|
||||
res = this._addRule(
|
||||
`${name ?? ''}${name ? '-' : ''}additional-kvs`,
|
||||
`${kvRuleName} ( "," space ` + kvRuleName + ` )*`
|
||||
)
|
||||
} else if (firstIsOptional) {
|
||||
res = `( "," space ${kvRuleName} )?`;
|
||||
const commaRef = `( "," space ${kvRuleName} )`;
|
||||
if (firstIsOptional) {
|
||||
res = commaRef + (k === '*' ? '*' : '?');
|
||||
} else {
|
||||
res = kvRuleName;
|
||||
res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : '');
|
||||
}
|
||||
if (rest.length > 0) {
|
||||
res += ' ' + this._addRule(
|
||||
|
|
|
@ -3,6 +3,13 @@
|
|||
|
||||
by Humans for All.
|
||||
|
||||
## quickstart
|
||||
|
||||
To run from the build dir
|
||||
|
||||
bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
|
||||
|
||||
Continue reading for the details.
|
||||
|
||||
## overview
|
||||
|
||||
|
@ -14,6 +21,8 @@ own system prompts.
|
|||
This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
|
||||
or potentially as it is being generated, in a streamed manner from the server/ai-model.
|
||||
|
||||

|
||||
|
||||
Auto saves the chat session locally as and when the chat is progressing and in turn at a later time when you
|
||||
open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
|
||||
|
||||
|
@ -44,12 +53,12 @@ http module.
|
|||
|
||||
### running using examples/server
|
||||
|
||||
bin/server -m path/model.gguf --path ../examples/server/public_simplechat [--port PORT]
|
||||
./llama-server -m path/model.gguf --path examples/server/public_simplechat [--port PORT]
|
||||
|
||||
### running using python3's server module
|
||||
|
||||
first run examples/server
|
||||
* bin/server -m path/model.gguf
|
||||
* ./llama-server -m path/model.gguf
|
||||
|
||||
next run this web front end in examples/server/public_simplechat
|
||||
* cd ../examples/server/public_simplechat
|
||||
|
@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t
|
|||
The histogram/freq based trimming logic is currently tuned for english language wrt its
|
||||
is-it-an-alphabetic|numeral-char regex match logic.
|
||||
|
||||
chatRequestOptions - maintains the list of options/fields to send along with chat request,
|
||||
apiRequestOptions - maintains the list of options/fields to send along with api request,
|
||||
irrespective of whether /chat/completions or /completions endpoint.
|
||||
|
||||
If you want to add additional options/fields to send to the server/ai-model, and/or
|
||||
modify the existing options value or remove them, for now you can update this global var
|
||||
using browser's development-tools/console.
|
||||
|
||||
For string and numeric fields in chatRequestOptions, including even those added by a user
|
||||
at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
|
||||
For string, numeric and boolean fields in apiRequestOptions, including even those added by a
|
||||
user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
|
||||
created.
|
||||
|
||||
cache_prompt option supported by example/server is allowed to be controlled by user, so that
|
||||
any caching supported wrt system-prompt and chat history, if usable can get used. When chat
|
||||
history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
|
||||
wrt same, based on aspects related to model, positional encoding, attention mechanism et al.
|
||||
However system prompt should ideally get the benefit of caching.
|
||||
|
||||
headers - maintains the list of http headers sent when request is made to the server. By default
|
||||
Content-Type is set to application/json. Additionally Authorization entry is provided, which can
|
||||
be set if needed using the settings ui.
|
||||
|
@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t
|
|||
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
||||
|
||||
|
||||
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
|
||||
implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||
some extent in a simple crude way. You may also want to control the context size enabled when
|
||||
the server loads ai-model, on the server end.
|
||||
By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
|
||||
the implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||
some extent in a simple crude way. You may also want to control the context size enabled when the
|
||||
server loads ai-model, on the server end.
|
||||
|
||||
|
||||
Sometimes the browser may be stubborn with caching of the file, so your updates to html/css/js
|
||||
|
@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
|
|||
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
|
||||
to /completions endpoint handling code on server side.
|
||||
|
||||
NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
|
||||
wrt the set of fields sent to server along with the user query. To check how the model behaves
|
||||
NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
|
||||
wrt the set of fields sent to server along with the user query, to check how the model behaves
|
||||
wrt repetitions in general in the generated text response.
|
||||
|
||||
An end-user can change this behaviour by editing gMe from browser's devel-tool/console or by
|
||||
using the providing settings ui.
|
||||
using the provided settings ui (for settings exposed through the ui).
|
||||
|
||||
|
||||
### OpenAi / Equivalent API WebService
|
||||
|
@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
|
|||
* the baseUrl in settings ui
|
||||
* https://api.openai.com/v1 or similar
|
||||
|
||||
* Wrt request body - gMe.chatRequestOptions
|
||||
* Wrt request body - gMe.apiRequestOptions
|
||||
* model (settings ui)
|
||||
* any additional fields if required in future
|
||||
|
||||
|
|
|
@ -222,8 +222,8 @@ class SimpleChat {
|
|||
* @param {Object} obj
|
||||
*/
|
||||
request_jsonstr_extend(obj) {
|
||||
for(let k in gMe.chatRequestOptions) {
|
||||
obj[k] = gMe.chatRequestOptions[k];
|
||||
for(let k in gMe.apiRequestOptions) {
|
||||
obj[k] = gMe.apiRequestOptions[k];
|
||||
}
|
||||
if (gMe.bStream) {
|
||||
obj["stream"] = true;
|
||||
|
@ -740,11 +740,12 @@ class Me {
|
|||
"Authorization": "", // Authorization: Bearer OPENAI_API_KEY
|
||||
}
|
||||
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||
this.chatRequestOptions = {
|
||||
this.apiRequestOptions = {
|
||||
"model": "gpt-3.5-turbo",
|
||||
"temperature": 0.7,
|
||||
"max_tokens": 1024,
|
||||
"n_predict": 1024,
|
||||
"cache_prompt": false,
|
||||
//"frequency_penalty": 1.2,
|
||||
//"presence_penalty": 1.2,
|
||||
};
|
||||
|
@ -800,51 +801,55 @@ class Me {
|
|||
|
||||
ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
|
||||
|
||||
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
|
||||
|
||||
}
|
||||
|
||||
ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
|
||||
ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
|
||||
ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Auto create ui input elements for fields in ChatRequestOptions
|
||||
* Auto create ui input elements for fields in apiRequestOptions
|
||||
* Currently supports text, number and boolean field types.
|
||||
* @param {HTMLDivElement} elDiv
|
||||
*/
|
||||
show_settings_chatrequestoptions(elDiv) {
|
||||
show_settings_apirequestoptions(elDiv) {
|
||||
let typeDict = {
|
||||
"string": "text",
|
||||
"number": "number",
|
||||
};
|
||||
let fs = document.createElement("fieldset");
|
||||
let legend = document.createElement("legend");
|
||||
legend.innerText = "ChatRequestOptions";
|
||||
legend.innerText = "ApiRequestOptions";
|
||||
fs.appendChild(legend);
|
||||
elDiv.appendChild(fs);
|
||||
for(const k in this.chatRequestOptions) {
|
||||
let val = this.chatRequestOptions[k];
|
||||
for(const k in this.apiRequestOptions) {
|
||||
let val = this.apiRequestOptions[k];
|
||||
let type = typeof(val);
|
||||
if (!((type == "string") || (type == "number"))) {
|
||||
continue;
|
||||
if (((type == "string") || (type == "number"))) {
|
||||
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
|
||||
if (type == "number") {
|
||||
val = Number(val);
|
||||
}
|
||||
this.apiRequestOptions[k] = val;
|
||||
});
|
||||
fs.appendChild(inp.div);
|
||||
} else if (type == "boolean") {
|
||||
let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
|
||||
this.apiRequestOptions[k] = userVal;
|
||||
});
|
||||
fs.appendChild(bbtn.div);
|
||||
}
|
||||
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
|
||||
if (type == "number") {
|
||||
val = Number(val);
|
||||
}
|
||||
this.chatRequestOptions[k] = val;
|
||||
});
|
||||
fs.appendChild(inp.div);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -870,6 +875,23 @@ class Me {
|
|||
});
|
||||
elDiv.appendChild(bb.div);
|
||||
|
||||
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
|
||||
this.bTrimGarbage = val;
|
||||
});
|
||||
elDiv.appendChild(bb.div);
|
||||
|
||||
this.show_settings_apirequestoptions(elDiv);
|
||||
|
||||
let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
|
||||
this.apiEP = ApiEP.Type[val];
|
||||
});
|
||||
elDiv.appendChild(sel.div);
|
||||
|
||||
sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
|
||||
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
|
||||
});
|
||||
elDiv.appendChild(sel.div);
|
||||
|
||||
bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
|
||||
this.bCompletionFreshChatAlways = val;
|
||||
});
|
||||
|
@ -880,23 +902,6 @@ class Me {
|
|||
});
|
||||
elDiv.appendChild(bb.div);
|
||||
|
||||
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
|
||||
this.bTrimGarbage = val;
|
||||
});
|
||||
elDiv.appendChild(bb.div);
|
||||
|
||||
let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
|
||||
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
|
||||
});
|
||||
elDiv.appendChild(sel.div);
|
||||
|
||||
sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
|
||||
this.apiEP = ApiEP.Type[val];
|
||||
});
|
||||
elDiv.appendChild(sel.div);
|
||||
|
||||
this.show_settings_chatrequestoptions(elDiv);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
BIN
examples/server/public_simplechat/simplechat_screens.webp
Normal file
BIN
examples/server/public_simplechat/simplechat_screens.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 21 KiB |
|
@ -1594,7 +1594,7 @@ struct server_context {
|
|||
} else {
|
||||
std::string prompt;
|
||||
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
|
||||
json_value(task.data, "prompt", std::string());
|
||||
prompt = json_value(task.data, "prompt", std::string());
|
||||
}
|
||||
|
||||
slot = get_available_slot(prompt);
|
||||
|
@ -2020,6 +2020,7 @@ struct server_context {
|
|||
slot.t_start_generation = 0;
|
||||
|
||||
if (slot.infill) {
|
||||
const bool add_bos = llama_should_add_bos_token(model);
|
||||
bool suff_rm_leading_spc = true;
|
||||
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
|
@ -2035,11 +2036,21 @@ struct server_context {
|
|||
}
|
||||
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
|
||||
prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
|
||||
prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
|
||||
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
|
||||
prefix_tokens.push_back(llama_token_middle(model));
|
||||
prompt_tokens = prefix_tokens;
|
||||
suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
|
||||
|
||||
auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
|
||||
auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
|
||||
if (add_bos) {
|
||||
embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
|
||||
}
|
||||
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
|
||||
|
||||
const llama_token middle_token = llama_token_middle(model);
|
||||
if (middle_token >= 0) {
|
||||
embd_inp.push_back(middle_token);
|
||||
}
|
||||
|
||||
prompt_tokens = embd_inp;
|
||||
} else {
|
||||
prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
|
||||
}
|
||||
|
@ -2606,17 +2617,9 @@ int main(int argc, char ** argv) {
|
|||
|
||||
// print sample chat example to make it clear which template is used
|
||||
{
|
||||
json chat;
|
||||
chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
|
||||
chat.push_back({{"role", "user"}, {"content", "Hello"}});
|
||||
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
|
||||
chat.push_back({{"role", "user"}, {"content", "How are you?"}});
|
||||
|
||||
const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
|
||||
|
||||
LOG_INFO("chat template", {
|
||||
{"chat_example", chat_example},
|
||||
{"built_in", params.chat_template.empty()},
|
||||
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
|
||||
{"built_in", params.chat_template.empty()},
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -27,10 +27,8 @@ To mitigate it, you can increase values in `n_predict`, `kv_size`.
|
|||
|
||||
```shell
|
||||
cd ../../..
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_CURL=ON ../
|
||||
cmake --build . --target server
|
||||
cmake -B build -DLLAMA_CURL=ON
|
||||
cmake --build build --target llama-server
|
||||
```
|
||||
|
||||
2. Start the test: `./tests.sh`
|
||||
|
@ -40,7 +38,7 @@ It's possible to override some scenario steps values with environment variables:
|
|||
| variable | description |
|
||||
|--------------------------|------------------------------------------------------------------------------------------------|
|
||||
| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` |
|
||||
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` |
|
||||
| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/llama-server` |
|
||||
| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` |
|
||||
| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format |
|
||||
| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` |
|
||||
|
|
|
@ -82,7 +82,7 @@ Feature: llama.cpp server
|
|||
|
||||
Examples: Prompts
|
||||
| response_format | n_predicted | re_content |
|
||||
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
|
||||
| {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" |
|
||||
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
|
||||
| {"type": "json_object"} | 10 | \{ " Jacky. |
|
||||
|
||||
|
|
|
@ -1272,9 +1272,9 @@ def context_text(context):
|
|||
|
||||
def start_server_background(context):
|
||||
if os.name == 'nt':
|
||||
context.server_path = '../../../build/bin/Release/server.exe'
|
||||
context.server_path = '../../../build/bin/Release/llama-server.exe'
|
||||
else:
|
||||
context.server_path = '../../../build/bin/server'
|
||||
context.server_path = '../../../build/bin/llama-server'
|
||||
if 'LLAMA_SERVER_BIN_PATH' in os.environ:
|
||||
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
|
||||
server_listen_addr = context.server_fqdn
|
||||
|
|
|
@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin
|
|||
|
||||
// Format given chat. If tmpl is empty, we take the template from model metadata
|
||||
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
|
||||
size_t alloc_size = 0;
|
||||
// vector holding all allocated string to be passed to llama_chat_apply_template
|
||||
std::vector<std::string> str(messages.size() * 2);
|
||||
std::vector<llama_chat_message> chat(messages.size());
|
||||
std::vector<llama_chat_msg> chat;
|
||||
|
||||
for (size_t i = 0; i < messages.size(); ++i) {
|
||||
const auto & curr_msg = messages[i];
|
||||
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
|
||||
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
|
||||
alloc_size += str[i*2 + 1].length();
|
||||
chat[i].role = str[i*2 + 0].c_str();
|
||||
chat[i].content = str[i*2 + 1].c_str();
|
||||
std::string role = json_value(curr_msg, "role", std::string(""));
|
||||
std::string content = json_value(curr_msg, "content", std::string(""));
|
||||
chat.push_back({role, content});
|
||||
}
|
||||
|
||||
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
|
||||
std::vector<char> buf(alloc_size * 2);
|
||||
|
||||
// run the first time to get the total output length
|
||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
||||
|
||||
// if it turns out that our buffer is too small, we resize it
|
||||
if ((size_t) res > buf.size()) {
|
||||
buf.resize(res);
|
||||
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
||||
}
|
||||
|
||||
const std::string formatted_chat(buf.data(), res);
|
||||
|
||||
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
|
||||
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
||||
|
||||
return formatted_chat;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue