diff --git a/Makefile b/Makefile
index eb1da90f1..06b831264 100644
--- a/Makefile
+++ b/Makefile
@@ -1455,21 +1455,8 @@ llama-server: \
 	examples/server/server.cpp \
 	examples/server/utils.hpp \
 	examples/server/httplib.h \
-	examples/server/colorthemes.css.hpp \
-	examples/server/style.css.hpp \
-	examples/server/theme-beeninorder.css.hpp \
-	examples/server/theme-ketivah.css.hpp \
-	examples/server/theme-mangotango.css.hpp \
-	examples/server/theme-playground.css.hpp \
-	examples/server/theme-polarnight.css.hpp \
-	examples/server/theme-snowstorm.css.hpp \
 	examples/server/index.html.hpp \
-	examples/server/index-new.html.hpp \
-	examples/server/index.js.hpp \
 	examples/server/completion.js.hpp \
-	examples/server/system-prompts.js.hpp \
-	examples/server/prompt-formats.js.hpp \
-	examples/server/json-schema-to-grammar.mjs.hpp \
 	examples/server/loading.html.hpp \
 	common/json.hpp \
 	common/stb_image.h \
diff --git a/examples/server/public/colorthemes.css b/examples/server/public_legacy/colorthemes.css
similarity index 100%
rename from examples/server/public/colorthemes.css
rename to examples/server/public_legacy/colorthemes.css
diff --git a/examples/server/public_legacy/completion.js b/examples/server/public_legacy/completion.js
new file mode 100644
index 000000000..30df7c2fa
--- /dev/null
+++ b/examples/server/public_legacy/completion.js
@@ -0,0 +1,209 @@
+const paramDefaults = {
+  stream: true,
+  n_predict: 500,
+  temperature: 0.2,
+  stop: ["</s>"]
+};
+
+let generation_settings = null;
+
+
+// Completes the prompt as a generator. Recommended for most use cases.
+//
+// Example:
+//
+//    import { llama } from '/completion.js'
+//
+//    const request = llama("Tell me a joke", {n_predict: 800})
+//    for await (const chunk of request) {
+//      document.write(chunk.data.content)
+//    }
+//
+export async function* llama(prompt, params = {}, config = {}) {
+  let controller = config.controller;
+  const api_url = config.api_url?.replace(/\/+$/, '') || "";
+
+  if (!controller) {
+    controller = new AbortController();
+  }
+
+  const completionParams = { ...paramDefaults, ...params, prompt };
+
+  const response = await fetch(`${api_url}${config.endpoint || '/completion'}`, {
+    method: 'POST',
+    body: JSON.stringify(completionParams),
+    headers: {
+      'Connection': 'keep-alive',
+      'Content-Type': 'application/json',
+      'Accept': 'text/event-stream',
+      ...(params.api_key ? {'Authorization': `Bearer ${params.api_key}`} : {})
+    },
+    signal: controller.signal,
+  });
+
+  const reader = response.body.getReader();
+  const decoder = new TextDecoder();
+
+  let content = "";
+  let leftover = ""; // Buffer for partially read lines
+
+  try {
+    let cont = true;
+
+    while (cont) {
+      const result = await reader.read();
+      if (result.done) {
+        break;
+      }
+
+      // Add any leftover data to the current chunk of data
+      const text = leftover + decoder.decode(result.value);
+
+      // Check if the last character is a line break
+      const endsWithLineBreak = text.endsWith('\n');
+
+      // Split the text into lines
+      let lines = text.split('\n');
+
+      // If the text doesn't end with a line break, then the last line is incomplete
+      // Store it in leftover to be added to the next chunk of data
+      if (!endsWithLineBreak) {
+        leftover = lines.pop();
+      } else {
+        leftover = ""; // Reset leftover if we have a line break at the end
+      }
+
+      // Parse all sse events and add them to result
+      const regex = /^(\S+):\s(.*)$/gm;
+      for (const line of lines) {
+        const match = regex.exec(line);
+        if (match) {
+          result[match[1]] = match[2];
+          if (result.data === '[DONE]') {
+            cont = false;
+            break;
+          }
+
+          // since we know this is llama.cpp, let's just decode the json in data
+          if (result.data) {
+            result.data = JSON.parse(result.data);
+            content += result.data.content;
+
+            // yield
+            yield result;
+
+            // if we got a stop token from server, we will break here
+            if (result.data.stop) {
+              if (result.data.generation_settings) {
+                generation_settings = result.data.generation_settings;
+              }
+              cont = false;
+              break;
+            }
+          }
+          if (result.error) {
+            try {
+              result.error = JSON.parse(result.error);
+              if (result.error.message.includes('slot unavailable')) {
+                // Throw an error to be caught by upstream callers
+                throw new Error('slot unavailable');
+              } else {
+                console.error(`llama.cpp error [${result.error.code} - ${result.error.type}]: ${result.error.message}`);
+              }
+            } catch(e) {
+              console.error(`llama.cpp error ${result.error}`)
+            }
+          }
+        }
+      }
+    }
+  } catch (e) {
+    if (e.name !== 'AbortError') {
+      console.error("llama error: ", e);
+    }
+    throw e;
+  }
+  finally {
+    controller.abort();
+  }
+
+  return content;
+}
+
+// Call llama, return an event target that you can subscribe to
+//
+// Example:
+//
+//    import { llamaEventTarget } from '/completion.js'
+//
+//    const conn = llamaEventTarget(prompt)
+//    conn.addEventListener("message", (chunk) => {
+//      document.write(chunk.detail.content)
+//    })
+//
+export const llamaEventTarget = (prompt, params = {}, config = {}) => {
+  const eventTarget = new EventTarget();
+  (async () => {
+    let content = "";
+    for await (const chunk of llama(prompt, params, config)) {
+      if (chunk.data) {
+        content += chunk.data.content;
+        eventTarget.dispatchEvent(new CustomEvent("message", { detail: chunk.data }));
+      }
+      if (chunk.data.generation_settings) {
+        eventTarget.dispatchEvent(new CustomEvent("generation_settings", { detail: chunk.data.generation_settings }));
+      }
+      if (chunk.data.timings) {
+        eventTarget.dispatchEvent(new CustomEvent("timings", { detail: chunk.data.timings }));
+      }
+    }
+    eventTarget.dispatchEvent(new CustomEvent("done", { detail: { content } }));
+  })();
+  return eventTarget;
+}
+
+// Call llama, return a promise that resolves to the completed text. This does not support streaming
+//
+// Example:
+//
+//     llamaPromise(prompt).then((content) => {
+//       document.write(content)
+//     })
+//
+//     or
+//
+//     const content = await llamaPromise(prompt)
+//     document.write(content)
+//
+export const llamaPromise = (prompt, params = {}, config = {}) => {
+  return new Promise(async (resolve, reject) => {
+    let content = "";
+    try {
+      for await (const chunk of llama(prompt, params, config)) {
+        content += chunk.data.content;
+      }
+      resolve(content);
+    } catch (error) {
+      reject(error);
+    }
+  });
+};
+
+/**
+ * (deprecated)
+ */
+export const llamaComplete = async (params, controller, callback) => {
+  for await (const chunk of llama(params.prompt, params, { controller })) {
+    callback(chunk);
+  }
+}
+
+// Get the model info from the server. This is useful for getting the context window and so on.
+export const llamaModelInfo = async (config = {}) => {
+  if (!generation_settings) {
+    const api_url = config.api_url?.replace(/\/+$/, '') || "";
+    const props = await fetch(`${api_url}/props`).then(r => r.json());
+    generation_settings = props.default_generation_settings;
+  }
+  return generation_settings;
+}
diff --git a/examples/server/public/favicon.ico b/examples/server/public_legacy/favicon.ico
similarity index 100%
rename from examples/server/public/favicon.ico
rename to examples/server/public_legacy/favicon.ico
diff --git a/examples/server/public/index-new.html b/examples/server/public_legacy/index-new.html
similarity index 100%
rename from examples/server/public/index-new.html
rename to examples/server/public_legacy/index-new.html
diff --git a/examples/server/public_legacy/index.html b/examples/server/public_legacy/index.html
new file mode 100644
index 000000000..a95f5c6df
--- /dev/null
+++ b/examples/server/public_legacy/index.html
@@ -0,0 +1,1303 @@
+
+
+
+
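
Note: the new public_legacy/completion.js exports llama (a streaming generator), llamaEventTarget, llamaPromise, llamaComplete (deprecated) and llamaModelInfo. Below is a minimal usage sketch, assuming the file is still served by llama-server at /completion.js, the page is loaded as an ES module, and /props returns default_generation_settings; the prompt string and n_predict value are illustrative only and not part of this patch.

    import { llama, llamaModelInfo } from '/completion.js';

    (async () => {
      // Assumed: /props exposes default_generation_settings (context size, sampling defaults, ...).
      const settings = await llamaModelInfo();
      console.log('default generation settings:', settings);

      // Stream a completion and accumulate the generated text chunk by chunk.
      let text = '';
      for await (const chunk of llama('Write one short sentence.', { n_predict: 64 })) {
        text += chunk.data.content;
      }
      console.log(text);
    })();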