diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix index 7535ca0f3..7a5e1dd0f 100644 --- a/.devops/nix/sif.nix +++ b/.devops/nix/sif.nix @@ -7,7 +7,7 @@ }: let - optionalInt = cond: x: if cond then x else 0; + optionalInt = cond: x: if cond then x else 0; in singularity-tools.buildImage rec { inherit (llama-cpp) name; diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 92e1108b3..b82205992 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -3,12 +3,14 @@ name: Python check requirements.txt on: push: paths: + - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - 'requirements.txt' - 'requirements/*.txt' pull_request: paths: + - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - 'requirements.txt' @@ -26,4 +28,4 @@ jobs: with: python-version: "3.11" - name: Run check-requirements.sh script - run: bash scripts/check-requirements.sh nocleanup + run: bash scripts/check-requirements.sh diff --git a/README.md b/README.md index 5401e197f..67717c1e3 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics +- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761)) - Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) - Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631 - Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590 diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index d3e8ec1f6..28b92ac38 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -96,9 +96,11 @@ class Model: if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None: + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) if (n_experts := self.hparams.get("num_local_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) @@ -220,6 +222,8 @@ class Model: return NomicBertModel if model_architecture == "GemmaForCausalLM": return GemmaModel + if model_architecture == "Starcoder2ForCausalLM": + return Model return Model def _is_model_safetensors(self) -> bool: @@ -281,6 +285,8 @@ class Model: return gguf.MODEL_ARCH.NOMIC_BERT if arch == "GemmaForCausalLM": return gguf.MODEL_ARCH.GEMMA + if arch == "Starcoder2ForCausalLM": + return gguf.MODEL_ARCH.STARCODER2 raise NotImplementedError(f'Architecture "{arch}" not supported!') diff --git a/examples/server/README.md b/examples/server/README.md index 128a7dd58..1bc4933cd 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -342,7 +342,7 @@ Notice that each `probs` is an array of length `n_probs`. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. *Options:* @@ -544,20 +544,7 @@ bash chat.sh ### API like OAI -API example using Python Flask: [api_like_OAI.py](api_like_OAI.py) -This example must be used with server.cpp - -```sh -python api_like_OAI.py -``` - -After running the API server, you can use it in Python by setting the API base URL. - -```python -openai.api_base = "http://:port" -``` - -Then you can utilize llama.cpp as an OpenAI **chat.completion** or **text_completion** API +The HTTP server supports OAI-like API ### Extending or building alternative Web Front End diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py deleted file mode 100755 index 607fe49d3..000000000 --- a/examples/server/api_like_OAI.py +++ /dev/null @@ -1,228 +0,0 @@ -#!/usr/bin/env python3 -import argparse -from flask import Flask, jsonify, request, Response -import urllib.parse -import requests -import time -import json - - -app = Flask(__name__) -slot_id = -1 - -parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.") -parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.') -parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ") -parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ") -parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ") -parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '')", default="") -parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080') -parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="") -parser.add_argument("--host", type=str, help="Set the ip address to listen.(default: 127.0.0.1)", default='127.0.0.1') -parser.add_argument("--port", type=int, help="Set the port to listen.(default: 8081)", default=8081) - -args = parser.parse_args() - -def is_present(json, key): - try: - buf = json[key] - except KeyError: - return False - if json[key] == None: - return False - return True - -#convert chat to prompt -def convert_chat(messages): - - system_n = args.system_name - user_n = args.user_name - ai_n = args.ai_name - stop = args.stop - - prompt = "" + args.chat_prompt + stop - - for line in messages: - if (line["role"] == "system"): - prompt += f"{system_n}{line['content']}{stop}" - if (line["role"] == "user"): - prompt += f"{user_n}{line['content']}{stop}" - if (line["role"] == "assistant"): - prompt += f"{ai_n}{line['content']}{stop}" - prompt += ai_n.rstrip() - - return prompt - -def make_postData(body, chat=False, stream=False): - postData = {} - if (chat): - postData["prompt"] = convert_chat(body["messages"]) - else: - postData["prompt"] = body["prompt"] - if(is_present(body, "temperature")): postData["temperature"] = body["temperature"] - if(is_present(body, "top_k")): postData["top_k"] = body["top_k"] - if(is_present(body, "top_p")): postData["top_p"] = body["top_p"] - if(is_present(body, "max_tokens")): postData["n_predict"] = body["max_tokens"] - if(is_present(body, "presence_penalty")): postData["presence_penalty"] = body["presence_penalty"] - if(is_present(body, "frequency_penalty")): postData["frequency_penalty"] = body["frequency_penalty"] - if(is_present(body, "repeat_penalty")): postData["repeat_penalty"] = body["repeat_penalty"] - if(is_present(body, "mirostat")): postData["mirostat"] = body["mirostat"] - if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"] - if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"] - if(is_present(body, "seed")): postData["seed"] = body["seed"] - if(is_present(body, "grammar")): postData["grammar"] = body["grammar"] - if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()] - if (args.stop != ""): - postData["stop"] = [args.stop] - else: - postData["stop"] = [] - if(is_present(body, "stop")): postData["stop"] += body["stop"] - postData["n_keep"] = -1 - postData["stream"] = stream - postData["cache_prompt"] = True - postData["slot_id"] = slot_id - return postData - -def make_resData(data, chat=False, promptToken=[]): - resData = { - "id": "chatcmpl" if (chat) else "cmpl", - "object": "chat.completion" if (chat) else "text_completion", - "created": int(time.time()), - "truncated": data["truncated"], - "model": "LLaMA_CPP", - "usage": { - "prompt_tokens": data["tokens_evaluated"], - "completion_tokens": data["tokens_predicted"], - "total_tokens": data["tokens_evaluated"] + data["tokens_predicted"] - } - } - if (len(promptToken) != 0): - resData["promptToken"] = promptToken - if (chat): - #only one choice is supported - resData["choices"] = [{ - "index": 0, - "message": { - "role": "assistant", - "content": data["content"], - }, - "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" - }] - else: - #only one choice is supported - resData["choices"] = [{ - "text": data["content"], - "index": 0, - "logprobs": None, - "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" - }] - return resData - -def make_resData_stream(data, chat=False, time_now = 0, start=False): - resData = { - "id": "chatcmpl" if (chat) else "cmpl", - "object": "chat.completion.chunk" if (chat) else "text_completion.chunk", - "created": time_now, - "model": "LLaMA_CPP", - "choices": [ - { - "finish_reason": None, - "index": 0 - } - ] - } - slot_id = data.get("slot_id") - if (chat): - if (start): - resData["choices"][0]["delta"] = { - "role": "assistant" - } - else: - resData["choices"][0]["delta"] = { - "content": data["content"] - } - if (data["stop"]): - resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" - else: - resData["choices"][0]["text"] = data["content"] - if (data["stop"]): - resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" - - return resData - - -@app.route('/chat/completions', methods=['POST', 'OPTIONS']) -@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS']) -def chat_completions(): - if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): - return Response(status=403) - if request.method == 'OPTIONS': - return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) - body = request.get_json() - stream = False - tokenize = False - if(is_present(body, "stream")): stream = body["stream"] - if(is_present(body, "tokenize")): tokenize = body["tokenize"] - postData = make_postData(body, chat=True, stream=stream) - - promptToken = [] - if (tokenize): - tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json() - promptToken = tokenData["tokens"] - - if (not stream): - data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData)) - print(data.json()) - resData = make_resData(data.json(), chat=True, promptToken=promptToken) - return jsonify(resData) - else: - def generate(): - data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True) - time_now = int(time.time()) - resData = make_resData_stream({}, chat=True, time_now=time_now, start=True) - yield 'data: {}\n\n'.format(json.dumps(resData)) - for line in data.iter_lines(): - if line: - decoded_line = line.decode('utf-8') - resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now) - yield 'data: {}\n\n'.format(json.dumps(resData)) - return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) - - -@app.route('/completions', methods=['POST', 'OPTIONS']) -@app.route('/v1/completions', methods=['POST', 'OPTIONS']) -def completion(): - if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): - return Response(status=403) - if request.method == 'OPTIONS': - return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) - body = request.get_json() - stream = False - tokenize = False - if(is_present(body, "stream")): stream = body["stream"] - if(is_present(body, "tokenize")): tokenize = body["tokenize"] - postData = make_postData(body, chat=False, stream=stream) - - promptToken = [] - if (tokenize): - tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json() - promptToken = tokenData["tokens"] - - if (not stream): - data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData)) - print(data.json()) - resData = make_resData(data.json(), chat=False, promptToken=promptToken) - return jsonify(resData) - else: - def generate(): - data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True) - time_now = int(time.time()) - for line in data.iter_lines(): - if line: - decoded_line = line.decode('utf-8') - resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now) - yield 'data: {}\n\n'.format(json.dumps(resData)) - return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) - -if __name__ == '__main__': - app.run(args.host, port=args.port) diff --git a/flake.nix b/flake.nix index dc4e503c3..45f9deda0 100644 --- a/flake.nix +++ b/flake.nix @@ -107,11 +107,12 @@ # ``` # # Cf. https://nixos.org/manual/nix/unstable/command-ref/new-cli/nix3-flake.html?highlight=flake#flake-format - flake.overlays.default = - (final: prev: { + flake.overlays.default = ( + final: prev: { llamaPackages = final.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; inherit (final.llamaPackages) llama-cpp; - }); + } + ); systems = [ "aarch64-darwin" @@ -131,6 +132,9 @@ ... }: { + # For standardised reproducible formatting with `nix fmt` + formatter = pkgs.nixfmt-rfc-style; + # Unlike `.#packages`, legacyPackages may contain values of # arbitrary types (including nested attrsets) and may even throw # exceptions. This attribute isn't recursed into by `nix flake diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8f9139d1b..5db760cb1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -112,6 +112,7 @@ class MODEL_ARCH(IntEnum): INTERNLM2 = auto() MINICPM = auto() GEMMA = auto() + STARCODER2 = auto() class MODEL_TENSOR(IntEnum): @@ -169,6 +170,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.STARCODER2: "starcoder2", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -526,6 +528,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_NORM, ], + MODEL_ARCH.STARCODER2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } @@ -554,6 +571,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.STARCODER2: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], } # diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 861003776..db2ec9704 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -210,6 +210,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "model.layers.{bid}.mlp.c_fc", # starcoder2 ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -256,6 +257,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mlp.down_proj", # plamo "model.layers.{bid}.feed_forward.w2", # internlm2 "encoder.layers.{bid}.mlp.fc2", # nomic-bert + "model.layers.{bid}.mlp.c_proj", # starcoder2 ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/llama.cpp b/llama.cpp index 099dc0a84..737dd77c0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -211,6 +211,7 @@ enum llm_arch { LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, LLM_ARCH_GEMMA, + LLM_ARCH_STARCODER2, LLM_ARCH_UNKNOWN, }; @@ -238,6 +239,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, }; enum llm_kv { @@ -779,6 +781,24 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_STARCODER2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -3320,6 +3340,16 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_STARCODER2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 30: model.type = e_model::MODEL_3B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_15B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -4490,6 +4520,56 @@ static bool llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); } } break; + case LLM_ARCH_STARCODER2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + // optional bias tensors + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + // optional bias tensors + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -7559,6 +7639,120 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_starcoder2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -7705,6 +7899,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_gemma(); } break; + case LLM_ARCH_STARCODER2: + { + result = llm.build_starcoder2(); + } break; default: GGML_ASSERT(false); } @@ -12084,6 +12282,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN2: case LLM_ARCH_PHI2: case LLM_ARCH_GEMMA: + case LLM_ARCH_STARCODER2: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here diff --git a/requirements/requirements-convert-hf-to-gguf.txt b/requirements/requirements-convert-hf-to-gguf.txt index 6ac402610..6ce840d73 100644 --- a/requirements/requirements-convert-hf-to-gguf.txt +++ b/requirements/requirements-convert-hf-to-gguf.txt @@ -1,2 +1,3 @@ -r ./requirements-convert.txt torch~=2.1.1 +einops~=0.7.0