server : add comment about changing slot_state to bool

server : fix slot reuse
server : apply fix from #3722
2023-10-22 22:24:39 +03:00 · 2023-10-22 21:57:23 +03:00 · 2023-10-22 21:05:45 +03:00 · 2023-10-22 20:09:25 +03:00 · 2023-10-22 20:03:35 +03:00 · 2023-10-22 19:52:50 +03:00
12 changed files with 3980 additions and 2950 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,6 +10,7 @@
 *.gcno
 *.gcda
 *.dot
+*.bat
 *.metallib
 .DS_Store
 .build/
--- a/4
+++ b/4
@ -605,8 +605,8 @@ embedding: examples/embedding/embedding.cpp                   build-info.h ggml.
 save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2)
+server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual

 gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
--- a/build.zig
+++ b/build.zig
@ -131,6 +131,7 @@ pub fn build(b: *std.build.Builder) !void {
    const sampling = make.obj("sampling", "common/sampling.cpp");
    const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
    const train = make.obj("train", "common/train.cpp");
+    const clip = make.obj("clip", "examples/llava/clip.cpp");

    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser });
    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common });
@ -139,7 +140,7 @@ pub fn build(b: *std.build.Builder) !void {
    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });
    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser, clip });
    if (server.target.isWindows()) {
        server.linkSystemLibrary("ws2_32");
    }
--- a/examples/llava/CMakeLists.txt
+++ b/examples/llava/CMakeLists.txt
@ -1,7 +1,7 @@
 set(TARGET clip)
 add_library(${TARGET} clip.cpp clip.h)
 install(TARGETS ${TARGET} LIBRARY)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common ggml ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 if (NOT MSVC)
    target_compile_options(${TARGET} PRIVATE -Wno-cast-qual) # stb_image.h
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@ -610,8 +610,8 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
        int idx_mean = get_key_idx(ctx, KEY_IMAGE_MEAN);
        int idx_std = get_key_idx(ctx, KEY_IMAGE_STD);
        for (int i = 0; i < 3; ++i) {
-            new_clip->image_mean[i] = *((float *)gguf_get_arr_data(ctx, idx_mean));
-            new_clip->image_std[i] = *((float *)gguf_get_arr_data(ctx, idx_std));
+            new_clip->image_mean[i] = *((const float *)gguf_get_arr_data(ctx, idx_mean));
+            new_clip->image_std[i] = *((const float *)gguf_get_arr_data(ctx, idx_std));
        }

        if (verbosity >= 2) {
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -6,7 +6,7 @@ install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama clip ${CMAKE_THREAD_LIBS_INIT})
 if (WIN32)
    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -24,6 +24,10 @@ Command line options:
 -   `--port`: Set the port to listen. Default: `8080`.
 -   `--path`: path from which to serve static files (default examples/server/public)
 -   `--embedding`: Enable embedding extraction, Default: disabled.
+-   `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1)
+-   `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
+-   `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+-   `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.

 ## Build

@ -158,6 +162,8 @@ node index.js

    `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0)

+    `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:` In this case, `[img-12]` will be replaced by the embeddings of the image id 12 in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.
+
    *Result JSON:*

    Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
@ -188,6 +194,12 @@ node index.js

    `truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

+    `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)
+
+    `cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)
+
+    `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
+
 -   **POST** `/tokenize`: Tokenize a given text.

    *Options:*
@ -218,8 +230,32 @@ node index.js

    It also accepts all the options of `/completion` except `stream` and `prompt`.

+-   **GET** `/props`: Return the required assistant name and anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
+
 ## More examples

+### Change system prompt on runtime
+
+To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it.
+
+`prompt`: Specify a context that you want all connecting clients to respect.
+
+`anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
+
+`assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
+
+```json
+{
+    "system_prompt": {
+        "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
+        "anti_prompt": "User:",
+        "assistant_name": "Assistant:"
+    }
+}
+```
+
+**NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.
+
 ### Interactive mode

 Check the sample in [chat.mjs](chat.mjs).
--- a/examples/server/api_like_OAI.py
+++ b/examples/server/api_like_OAI.py
@ -8,6 +8,7 @@ import json


 app = Flask(__name__)
+slot_id = -1

 parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
 parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
@ -77,7 +78,8 @@ def make_postData(body, chat=False, stream=False):
    if(is_present(body, "stop")): postData["stop"] += body["stop"]
    postData["n_keep"] = -1
    postData["stream"] = stream
-
+    postData["cache_prompt"] = True
+    postData["slot_id"] = slot_id
    return postData

 def make_resData(data, chat=False, promptToken=[]):
@ -128,6 +130,7 @@ def make_resData_stream(data, chat=False, time_now = 0, start=False):
            }
        ]
    }
+    slot_id = data["slot_id"]
    if (chat):
        if (start):
            resData["choices"][0]["delta"] =  {
--- a/examples/server/chat.mjs
+++ b/examples/server/chat.mjs
@ -7,6 +7,11 @@ const args = process.argv.slice(2);
 const grammarJsonSchemaFile = args.find(
    (_, index) => args[index - 1] === "--grammar-json-schema"
 );
+
+const no_cached_prompt = args.find(
+    (_, index) => args[index - 1] === "--no-cache-prompt"
+) ?? "false";
+
 const grammarFile = args.find((_, index) => args[index - 1] === "--grammar");

 // Example usage: function,arguments
@ -30,6 +35,9 @@ if (grammarFile) {
    grammar = readFileSync(grammarFile, 'utf-8')
 }

+// for cached prompt
+let slot_id = -1;
+
 const API_URL = 'http://127.0.0.1:8080'

 const chat = [
@ -76,6 +84,8 @@ async function chat_completion(question) {
            top_p: 0.9,
            n_keep: n_keep,
            n_predict: 256,
+            cache_prompt: no_cached_prompt === "false",
+            slot_id: slot_id,
            stop: ["\n### Human:"], // stop completion after generating this
            grammar,
            stream: true,
@ -92,6 +102,7 @@ async function chat_completion(question) {
        const t = Buffer.from(chunk).toString('utf8')
        if (t.startsWith('data: ')) {
            const message = JSON.parse(t.substring(6))
+            slot_id = message.slot_id
            answer += message.content
            process.stdout.write(message.content)
            if (message.stop) {
--- a/examples/server/index.html.hpp
+++ b/examples/server/index.html.hpp
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@ -125,6 +125,7 @@
      background-color: #222;
      color: #ddd;
    }
+
    code {
      font-family: monospace;
      padding: 0.1em 0.3em;
@ -141,7 +142,8 @@
      display: inline;
    }

-    header, footer {
+    header,
+    footer {
      text-align: center;
    }

@ -163,6 +165,7 @@
      0% {
        background-position: 0%;
      }
+
      100% {
        background-position: 100%;
      }
@ -181,6 +184,7 @@
        --loading-color-1: #22222200;
        --loading-color-2: #222222ff;
      }
+
      .popover-content {
        background-color: black;
      }
@ -194,6 +198,8 @@

    import { llama } from '/completion.js';
    import { SchemaConverter } from '/json-schema-to-grammar.mjs';
+    let selected_image = false;
+    var slot_id = -1;

    const session = signal({
      prompt: "This is a conversation between User and Llama, a friendly chatbot. Llama is helpful, kind, honest, good at writing, and never fails to answer any requests immediately and with precision.",
@ -203,6 +209,7 @@
      type: "chat",  // "chat" | "completion"
      char: "Llama",
      user: "User",
+      image_selected: ''
    })

    const params = signal({
@ -220,7 +227,9 @@
      mirostat_tau: 5, // target entropy
      mirostat_eta: 0.1, // learning rate
      grammar: '',
-      n_probs: 0, // no completion_probabilities
+      n_probs: 0, // no completion_probabilities,
+      image_data: [],
+      cache_prompt: true
    })

    /* START: Support for storing prompt templates and parameters in borwser LocalStorage */
@ -270,6 +279,7 @@
      // saved templates were successfuly imported.

      console.log('Processing saved templates and updating default template')
+      params.value = { ...params.value, image_data: [] };

      //console.log(importedTemplates);
      savedUserTemplates.value = importedTemplates;
@ -294,7 +304,9 @@

    function userTemplateApply(t) {
      session.value = t.data.session;
+      session.value = { ...session.value, image_selected: '' };
      params.value = t.data.params;
+      params.value = { ...params.value, image_data: [] };
    }

    function userTemplateResetToDefaultAndApply() {
@ -385,7 +397,7 @@
        throw new Error("already running");
      }
      controller.value = new AbortController();
-      for await (const chunk of llama(prompt, llamaParams, {controller: controller.value})) {
+      for await (const chunk of llama(prompt, llamaParams, { controller: controller.value })) {
        const data = chunk.data;

        if (data.stop) {
@ -399,6 +411,11 @@
          console.log("Completion finished: '", currentMessages.map(msg => msg.content).join(''), "', summary: ", data);
        } else {
          currentMessages.push(data);
+          slot_id = data.slot_id;
+          if (selected_image && !data.multimodal) {
+            alert("The server was not compiled for multimodal or the model projector can't be loaded.");
+            return;
+          }
          transcriptUpdate([...history, [char, currentMessages]])
        }

@ -419,7 +436,7 @@

      transcriptUpdate([...session.value.transcript, ["{{user}}", msg]])

-      const prompt = template(session.value.template, {
+      let prompt = template(session.value.template, {
        message: msg,
        history: session.value.transcript.flatMap(
          ([name, data]) =>
@ -434,9 +451,12 @@
            )
        ).join("\n"),
      });
-
+      if (selected_image) {
+        prompt = `A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:[img-10]${msg}\nASSISTANT:`;
+      }
      await runLlama(prompt, {
        ...params.value,
+        slot_id: slot_id,
        stop: ["</s>", template("{{char}}:"), template("{{user}}:")],
      }, "{{char}}");
    }
@ -446,10 +466,11 @@
        console.log('already running...');
        return;
      }
-      const {prompt} = session.value;
+      const { prompt } = session.value;
      transcriptUpdate([...session.value.transcript, ["", prompt]]);
      await runLlama(prompt, {
        ...params.value,
+        slot_id: slot_id,
        stop: [],
      }, "");
    }
@ -467,6 +488,27 @@
      transcriptUpdate([]);
    }

+    const uploadImage = (e) => {
+      e.preventDefault();
+      document.getElementById("fileInput").click();
+      document.getElementById("fileInput").addEventListener("change", function (event) {
+        const selectedFile = event.target.files[0];
+        if (selectedFile) {
+          const reader = new FileReader();
+          reader.onload = function () {
+            const image_data = reader.result;
+            session.value = { ...session.value, image_selected: image_data };
+            params.value = {
+              ...params.value, image_data: [
+                { data: image_data.replace(/data:image\/[^;]+;base64,/, ''), id: 10 }]
+            }
+          };
+          selected_image = true;
+          reader.readAsDataURL(selectedFile);
+        }
+      });
+    }
+
    function MessageInput() {
      const message = useSignal("")

@ -497,6 +539,7 @@
          </div>
          <div class="right">
            <button type="submit" disabled=${generating.value}>Send</button>
+            <button onclick=${uploadImage}>Upload Image</button>
            <button onclick=${stop} disabled=${!generating.value}>Stop</button>
            <button onclick=${reset}>Reset</button>
          </div>
@ -540,7 +583,7 @@
            data;
          message = html`<${Markdownish} text=${template(text)} />`
        }
-        if(user) {
+        if (user) {
          return html`<p key=${index}><strong>${template(user)}:</strong> ${message}</p>`
        } else {
          return html`<p key=${index}>${message}</p>`
@ -549,6 +592,7 @@

      return html`
        <section id="chat" ref=${container}>
+          <img style="width: 60%;${!session.value.image_selected ? `display: none;` : ``}" src="${session.value.image_selected}"/>
          ${messages.flatMap(chatLine)}
        </section>`;
    };
@ -567,7 +611,7 @@
          const converter = new SchemaConverter(
            grammarJsonSchemaPropOrder.value
              .split(',')
-              .reduce((acc, cur, i) => ({...acc, [cur.trim()]: i}), {})
+              .reduce((acc, cur, i) => ({ ...acc, [cur.trim()]: i }), {})
          )
          converter.visit(schema, '')
          params.value = {
@ -579,7 +623,7 @@
        }
      }

-      const FloatField = ({label, max, min, name, step, value}) => {
+      const FloatField = ({ label, max, min, name, step, value }) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
@ -589,7 +633,7 @@
        `
      };

-      const IntField = ({label, max, min, name, value}) => {
+      const IntField = ({ label, max, min, name, value }) => {
        return html`
          <div>
            <label for="${name}">${label}</label>
@ -694,20 +738,20 @@
          ${session.value.type === 'chat' ? ChatConfigForm() : CompletionConfigForm()}

          <fieldset class="two">
-            ${IntField({label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict})}
-            ${FloatField({label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature})}
-            ${FloatField({label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty})}
-            ${IntField({label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n})}
-            ${IntField({label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k})}
-            ${FloatField({label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p})}
+            ${IntField({ label: "Predictions", max: 2048, min: -1, name: "n_predict", value: params.value.n_predict })}
+            ${FloatField({ label: "Temperature", max: 1.5, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })}
+            ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })}
+            ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })}
+            ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })}
+            ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })}
          </fieldset>
          <details>
            <summary>More options</summary>
            <fieldset class="two">
-              ${FloatField({label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z})}
-              ${FloatField({label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p})}
-              ${FloatField({label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty})}
-              ${FloatField({label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty})}
+              ${FloatField({ label: "TFS-Z", max: 1.0, min: 0.0, name: "tfs_z", step: 0.01, value: params.value.tfs_z })}
+              ${FloatField({ label: "Typical P", max: 1.0, min: 0.0, name: "typical_p", step: 0.01, value: params.value.typical_p })}
+              ${FloatField({ label: "Presence penalty", max: 1.0, min: 0.0, name: "presence_penalty", step: 0.01, value: params.value.presence_penalty })}
+              ${FloatField({ label: "Frequency penalty", max: 1.0, min: 0.0, name: "frequency_penalty", step: 0.01, value: params.value.frequency_penalty })}
            </fieldset>
            <hr />
            <fieldset class="three">
@ -716,11 +760,11 @@
                <label><input type="radio" name="mirostat" value="1" checked=${params.value.mirostat == 1} oninput=${updateParamsInt} /> Mirostat v1</label>
                <label><input type="radio" name="mirostat" value="2" checked=${params.value.mirostat == 2} oninput=${updateParamsInt} /> Mirostat v2</label>
              </div>
-              ${FloatField({label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau})}
-              ${FloatField({label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta})}
+              ${FloatField({ label: "Mirostat tau", max: 10.0, min: 0.0, name: "mirostat_tau", step: 0.01, value: params.value.mirostat_tau })}
+              ${FloatField({ label: "Mirostat eta", max: 1.0, min: 0.0, name: "mirostat_eta", step: 0.01, value: params.value.mirostat_eta })}
            </fieldset>
            <fieldset>
-              ${IntField({label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs})}
+              ${IntField({ label: "Show Probabilities", max: 10, min: 0, name: "n_probs", value: params.value.n_probs })}
            </fieldset>
          </details>
        </form>
@ -952,8 +996,11 @@
 </head>

 <body>
-  <div id="container"></div>
+  <div id="container">
+    <input type="file" id="fileInput" accept="image/*" style="display: none;">
+  </div>
  <div id="portal"></div>
 </body>

 </html>
+
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
Author	SHA1	Message	Date
Georgi Gerganov	c0f4d54870	server : add comment about changing slot_state to bool	2023-10-22 22:24:39 +03:00
Georgi Gerganov	83e1490187	server : fix slot reuse	2023-10-22 21:57:23 +03:00
Georgi Gerganov	8fe7ca4875	server : apply fix from #3722	2023-10-22 21:05:45 +03:00
Georgi Gerganov	00ae55b388	server : hide ctx_sampling->prev behind API (#3696 )	2023-10-22 20:09:25 +03:00
M. Yusuf Sarıgöz	3d6a687f1d	Update readme to document multimodal in server	2023-10-22 20:03:35 +03:00
Georgi Gerganov	dd1af2ed35	server : minor style	2023-10-22 19:52:50 +03:00
M. Yusuf Sarıgöz	a4d69d8b81	Merge branch 'server-rev' of https://github.com//ggerganov/llama.cpp into server-rev	2023-10-22 19:49:48 +03:00
M. Yusuf Sarıgöz	2679c432d5	Update readme to document multimodal in server	2023-10-22 19:49:33 +03:00
Georgi Gerganov	a8063171bd	server : completion requests remember slot_id	2023-10-22 19:34:48 +03:00
Georgi Gerganov	f305d6434f	editorconfig : new line in index.html	2023-10-22 19:10:30 +03:00
M. Yusuf Sarıgöz	5359fb9267	Do not save/load image_data to localStorage	2023-10-22 19:08:09 +03:00
Georgi Gerganov	f67d971344	server : bug fix for prompt caching	2023-10-22 17:52:59 +03:00
Georgi Gerganov	569ebf11cf	server : refactor ctx_sampling init + n_ctx + names	2023-10-22 16:57:05 +03:00
Georgi Gerganov	ef18f4d579	server : fix crash in Debug on macOS (I have no idea why this fixes it!?)	2023-10-22 16:55:40 +03:00
Georgi Gerganov	197a0a9e23	server : fix switch fallthrough	2023-10-22 16:55:05 +03:00
Georgi Gerganov	715f384a6b	clip : link to ggml, not to llama	2023-10-22 16:52:12 +03:00
Georgi Gerganov	4b4ab722ab	make : silence stb warnings	2023-10-22 16:51:59 +03:00
Georgi Gerganov	176993c871	Merge branch 'master' into server-rev	2023-10-22 15:04:16 +03:00
FSSRepo	2eb4c11ec5	fix image load + view image in chat	2023-10-21 14:34:19 -04:00
Jhen-Jie Hong	17b23eb9cb	server : fix multibyte handle in partial response (#3706 )	2023-10-21 14:58:03 +03:00
Georgi Gerganov	778c070d1b	server : logs + minor code style	2023-10-20 20:44:51 +03:00
Georgi Gerganov	5d540e80d1	server : no need for atomic int - already using mutex	2023-10-20 20:44:29 +03:00
Georgi Gerganov	113dd60005	server : bach has to be allocated for n_parallel sequences	2023-10-20 20:42:45 +03:00
FSSRepo	6b2437e32d	added thread safe pipeline	2023-10-20 12:07:32 -04:00
Georgi Gerganov	325d1793f7	server : minor sync	2023-10-19 15:03:24 +03:00
Georgi Gerganov	9740824ba5	server : snake case	2023-10-19 14:44:37 +03:00
Georgi Gerganov	e3a2c3fe32	server : use refs + use llama_batch_clear()	2023-10-19 14:44:04 +03:00
Georgi Gerganov	3d5929e8ee	server : bug fix in ingest_images n_tokens is incremented internally by llama_batch_add	2023-10-19 14:43:19 +03:00
Georgi Gerganov	a8c981b734	server : remove beam-search functionality	2023-10-19 14:10:37 +03:00
Georgi Gerganov	654e0a1fe0	server : coding-style normalization (part 2)	2023-10-19 14:09:45 +03:00
Georgi Gerganov	e44ed60187	server : coding-style normalization	2023-10-19 13:50:23 +03:00
FSSRepo	ab2fc00224	latest changes of sampling API	2023-10-18 16:57:48 -04:00
FSSRepo	8540568c48	Merge branch 'master' of https://github.com/ggerganov/llama.cpp	2023-10-18 16:55:26 -04:00
FSSRepo	7196c4e08a	new sampling API	2023-10-18 16:50:09 -04:00
Steward Garcia	84b8f2b060	Merge branch 'ggerganov:master' into master	2023-10-18 08:43:17 -04:00
FSSRepo	35fd37430f	fix zig build	2023-10-17 18:04:26 -04:00
FSSRepo	c02c52efb5	fix multiple clients	2023-10-17 17:54:56 -04:00
FSSRepo	d2b1fac6c7	fix make bui;d errors	2023-10-17 17:18:56 -04:00
FSSRepo	ed0c11cb83	multimodal support enabled by default	2023-10-17 16:58:20 -04:00
FSSRepo	6c277eaab5	update api like OpenAI	2023-10-17 16:53:38 -04:00
FSSRepo	58f8ae9bfe	readme change	2023-10-17 16:32:19 -04:00
FSSRepo	fa0f22f14f	Merge remote-tracking branch 'upstream/master'	2023-10-17 16:31:33 -04:00
FSSRepo	aa2268f4cd	sync README.md changes	2023-10-17 16:21:05 -04:00
FSSRepo	4d1804330e	fix llava implementation	2023-10-16 16:31:17 -04:00
FSSRepo	d7eca255d7	context shift fixed	2023-10-16 14:43:10 -04:00
FSSRepo	2d9f11db28	fixed premature end due stop word	2023-10-16 12:36:05 -04:00
FSSRepo	fd64f04fc2	fix long prompt than ctx proposed in #3639	2023-10-15 19:07:18 -04:00
FSSRepo	b727e022d6	fix ci make build undefined ref errors	2023-10-15 18:53:48 -04:00
FSSRepo	ce961a304b	some ci fixes	2023-10-15 18:46:01 -04:00
Steward Garcia	9035978aae	Merge pull request #6 from damian0815/fssrepo_mac_fixes fix compilation errors with llvm	2023-10-15 18:38:52 -04:00
Steward Garcia	f47fd17b73	Merge branch 'ggerganov:master' into master	2023-10-15 18:23:47 -04:00
FSSRepo	4e5c5c451c	notify the user from server ui that multimodality is unavialable	2023-10-14 08:28:49 -04:00
Damian Stewart	299f6b54d8	fix compilation errors with llvm	2023-10-14 11:17:38 +02:00
FSSRepo	7e64bfe060	refactor code + remove unused comments + improved README.md	2023-10-14 00:31:34 -04:00
FSSRepo	9f72b44635	add multimodal input - alfa	2023-10-13 23:36:32 -04:00
FSSRepo	de35b47908	fixed tokens probs	2023-10-13 19:55:25 -04:00
FSSRepo	9d98cdda2c	llava multimodal integration	2023-10-13 18:42:44 -04:00
FSSRepo	eb08201227	add changes to README.md	2023-10-13 14:28:06 -04:00
FSSRepo	a2c2d98c16	add context swap	2023-10-13 14:12:50 -04:00
FSSRepo	b6d9e212e5	fixed timings per slot	2023-10-13 13:10:38 -04:00
FSSRepo	a410a9e300	unused change reverted	2023-10-13 12:23:58 -04:00
FSSRepo	6358ae5f48	server ui now support multiple clients	2023-10-13 12:22:54 -04:00
FSSRepo	4ba5a5013d	chat.mjs support cached prompt + some fixes	2023-10-13 11:06:41 -04:00
FSSRepo	500ac7120e	cached prompt support	2023-10-12 21:16:12 -04:00
FSSRepo	83c2b3553a	grammar + no stream completion	2023-10-12 18:43:57 -04:00
FSSRepo	5b8e29de53	multiple client support	2023-10-12 17:09:12 -04:00
FSSRepo	81484805f0	completion endpoint working	2023-10-12 16:17:27 -04:00
FSSRepo	29c8cdd65d	refactored sampling function	2023-10-12 15:02:19 -04:00
FSSRepo	b716eeb72a	Merge branch 'master' of https://github.com/ggerganov/llama.cpp	2023-10-12 12:55:08 -04:00
FSSRepo	78504218b9	save dev progress	2023-10-12 12:51:48 -04:00
FSSRepo	471230202d	crash fixed	2023-10-11 19:48:15 -04:00
FSSRepo	63f99b1ea6	implementing parallel decoding in server example	2023-10-11 18:14:11 -04:00