Merge branch 'master' into update-server-instructions
Commit 1dd61d2aa3: 12 changed files with 1300 additions and 1166 deletions
@@ -68,8 +68,9 @@ option(LLAMA_ACCELERATE "llama: enable Accelerate framework
 option(LLAMA_BLAS "llama: use BLAS" OFF)
 set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
+option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
 set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
-set(LLAMA_CUDA_DMMV_Y "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
 option(LLAMA_CUDA_DMMV_F16 "llama: use 16 bit floats for dmmv CUDA kernels" OFF)
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
@@ -246,8 +247,14 @@ if (LLAMA_CUBLAS)
     set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
 
     add_compile_definitions(GGML_USE_CUBLAS)
+    if (LLAMA_CUDA_FORCE_DMMV)
+        add_compile_definitions(GGML_CUDA_FORCE_DMMV)
+    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
-    add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+    add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+    if (DEFINED LLAMA_CUDA_DMMV_Y)
+        add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_DMMV_Y}) # for backwards compatibility
+    endif()
     if (LLAMA_CUDA_DMMV_F16)
         add_compile_definitions(GGML_CUDA_DMMV_F16)
     endif()
@@ -263,7 +270,7 @@ if (LLAMA_CUBLAS)
         if (LLAMA_CUDA_DMMV_F16)
             set(CMAKE_CUDA_ARCHITECTURES "61") # needed for f16 CUDA intrinsics
         else()
-            set(CMAKE_CUDA_ARCHITECTURES "52") # lowest CUDA 12 standard
+            set(CMAKE_CUDA_ARCHITECTURES "52;61") # lowest CUDA 12 standard + lowest for integer intrinsics
         endif()
     endif()
     message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
Makefile: 13 changed lines
@@ -164,16 +164,21 @@ ifdef LLAMA_CUBLAS
 	OBJS      += ggml-cuda.o
 	NVCC       = nvcc
 	NVCCFLAGS  = --forward-unknown-to-host-compiler -arch=native
+ifdef LLAMA_CUDA_FORCE_DMMV
+	NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
+endif # LLAMA_CUDA_FORCE_DMMV
 ifdef LLAMA_CUDA_DMMV_X
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
 else
 	NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
 endif # LLAMA_CUDA_DMMV_X
-ifdef LLAMA_CUDA_DMMV_Y
-	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=$(LLAMA_CUDA_DMMV_Y)
+ifdef LLAMA_CUDA_MMV_Y
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
+else ifdef LLAMA_CUDA_DMMV_Y
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_DMMV_Y) # for backwards compatibility
 else
-	NVCCFLAGS += -DGGML_CUDA_DMMV_Y=1
+	NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
-endif # LLAMA_CUDA_DMMV_Y
+endif # LLAMA_CUDA_MMV_Y
 ifdef LLAMA_CUDA_DMMV_F16
 	NVCCFLAGS += -DGGML_CUDA_DMMV_F16
 endif # LLAMA_CUDA_DMMV_F16
@@ -11,6 +11,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 **Hot topics:**
 
+- Simple web chat example: https://github.com/ggerganov/llama.cpp/pull/1998
 - k-quants now support super-block size of 64: https://github.com/ggerganov/llama.cpp/pull/2001
 - New roadmap: https://github.com/users/ggerganov/projects/7
 - Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
@@ -344,8 +345,9 @@ Building the program with BLAS support may lead to some performance improvements
 
 | Option                  | Legal values           | Default | Description |
 |-------------------------|------------------------|---------|-------------|
+| LLAMA_CUDA_FORCE_DMMV   | Boolean                | false   | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 7.0/Turing/RTX 2000 or higher). Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_X       | Positive integer >= 32 | 32      | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
-| LLAMA_CUDA_DMMV_Y       | Positive integer       | 1       | Block size in y direction for the CUDA dequantization + mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
+| LLAMA_CUDA_MMV_Y        | Positive integer       | 1       | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
 | LLAMA_CUDA_DMMV_F16     | Boolean                | false   | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
 | LLAMA_CUDA_KQUANTS_ITER | 1 or 2                 | 2       | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
 
@@ -29,7 +29,7 @@ struct MyModel* create_mymodel(int argc, char ** argv) {
 
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, BUILD_NUMBER, BUILD_COMMIT);
 
-    if (params.seed < 0) {
+    if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
     fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);
@@ -1,6 +1,6 @@
 # llama.cpp/example/server
 
-This example demonstrates a simple HTTP API server and a simple web front end to interact with llama.cpp.
+This example demonstrates a simple HTTP API server and a simple web front end and a simple web front end to interact with llama.cpp.
 
 Command line options:
 
@@ -192,6 +192,38 @@ Run with bash:
 bash chat.sh
 ```
 
+### API like OAI
+
+API example using Python Flask: [api_like_OAI.py](api_like_OAI.py)
+This example must be used with server.cpp
+
+```sh
+python api_like_OAI.py
+```
+
+After running the API server, you can use it in Python by setting the API base URL.
+```python
+openai.api_base = "http://<Your api-server IP>:port"
+```
+
+Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
+
+### API like OAI
+
+API example using Python Flask: [api_like_OAI.py](api_like_OAI.py)
+This example must be used with server.cpp
+
+```sh
+python api_like_OAI.py
+```
+
+After running the API server, you can use it in Python by setting the API base URL.
+```python
+openai.api_base = "http://<Your api-server IP>:port"
+```
+
+Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API
+
 ### Extending the Web Front End
 
 The default location for the static files is `examples/server/public`. You can extend the front end by running the server binary with `--path` set to `./your-directory` and importing `/completion.js` to get access to the llamaComplete() method. A simple example is below:
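The README snippet above only shows setting `openai.api_base`; as an illustrative sketch (assuming the legacy `openai` 0.x Python client and the default host/port of `api_like_OAI.py`; these names are not part of the diff), a full round trip against the proxy could look like this:

```python
# Hypothetical usage sketch: talk to api_like_OAI.py with the legacy openai 0.x client.
# Host, port and model name are illustrative assumptions, not taken from the commit.
import openai

openai.api_key = "none"                        # the proxy only checks a key if --api-key is set
openai.api_base = "http://127.0.0.1:8081/v1"   # default host/port of api_like_OAI.py

resp = openai.ChatCompletion.create(
    model="LLaMA_CPP",   # the proxy ignores the model field
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(resp["choices"][0]["message"]["content"])
```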
examples/server/api_like_OAI.py: new executable file, 219 lines
@@ -0,0 +1,219 @@
import argparse
from flask import Flask, jsonify, request, Response
import urllib.parse
import requests
import time
import json


app = Flask(__name__)

parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.")
parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.\\n')
parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: '\\nUSER: ')", default="\\nUSER: ")
parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: '\\nASSISTANT: ')", default="\\nASSISTANT: ")
parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: '\\nASSISTANT's RULE: ')", default="\\nASSISTANT's RULE: ")
parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '</s>')", default="</s>")
parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080')
parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="")
parser.add_argument("--host", type=str, help="Set the ip address to listen.(default: 127.0.0.1)", default='127.0.0.1')
parser.add_argument("--port", type=int, help="Set the port to listen.(default: 8081)", default=8081)

args = parser.parse_args()

def is_present(json, key):
    try:
        buf = json[key]
    except KeyError:
        return False
    return True


#convert chat to prompt
def convert_chat(messages):
    prompt = "" + args.chat_prompt.replace("\\n", "\n")

    system_n = args.system_name.replace("\\n", "\n")
    user_n = args.user_name.replace("\\n", "\n")
    ai_n = args.ai_name.replace("\\n", "\n")
    stop = args.stop.replace("\\n", "\n")

    for line in messages:
        if (line["role"] == "system"):
            prompt += f"{system_n}{line['content']}"
        if (line["role"] == "user"):
            prompt += f"{user_n}{line['content']}"
        if (line["role"] == "assistant"):
            prompt += f"{ai_n}{line['content']}{stop}"
    prompt += ai_n.rstrip()

    return prompt

def make_postData(body, chat=False, stream=False):
    postData = {}
    if (chat):
        postData["prompt"] = convert_chat(body["messages"])
    else:
        postData["prompt"] = body["prompt"]
    if(is_present(body, "temperature")): postData["temperature"] = body["temperature"]
    if(is_present(body, "top_k")): postData["top_k"] = body["top_k"]
    if(is_present(body, "top_p")): postData["top_p"] = body["top_p"]
    if(is_present(body, "max_tokens")): postData["n_predict"] = body["max_tokens"]
    if(is_present(body, "presence_penalty")): postData["presence_penalty"] = body["presence_penalty"]
    if(is_present(body, "frequency_penalty")): postData["frequency_penalty"] = body["frequency_penalty"]
    if(is_present(body, "repeat_penalty")): postData["repeat_penalty"] = body["repeat_penalty"]
    if(is_present(body, "mirostat")): postData["mirostat"] = body["mirostat"]
    if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"]
    if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"]
    if(is_present(body, "seed")): postData["seed"] = body["seed"]
    if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()]
    if (args.stop != ""):
        postData["stop"] = [args.stop]
    else:
        postData["stop"] = []
    if(is_present(body, "stop")): postData["stop"] += body["stop"]
    postData["n_keep"] = -1
    postData["stream"] = stream

    return postData

def make_resData(data, chat=False, promptToken=[]):
    resData = {
        "id": "chatcmpl" if (chat) else "cmpl",
        "object": "chat.completion" if (chat) else "text_completion",
        "created": int(time.time()),
        "truncated": data["truncated"],
        "model": "LLaMA_CPP",
        "usage": {
            "prompt_tokens": data["tokens_evaluated"],
            "completion_tokens": data["tokens_predicted"],
            "total_tokens": data["tokens_evaluated"] + data["tokens_predicted"]
        }
    }
    if (len(promptToken) != 0):
        resData["promptToken"] = promptToken
    if (chat):
        #only one choice is supported
        resData["choices"] = [{
            "index": 0,
            "message": {
                "role": "assistant",
                "content": data["content"],
            },
            "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
        }]
    else:
        #only one choice is supported
        resData["choices"] = [{
            "text": data["content"],
            "index": 0,
            "logprobs": None,
            "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
        }]
    return resData

def make_resData_stream(data, chat=False, time_now = 0, start=False):
    resData = {
        "id": "chatcmpl" if (chat) else "cmpl",
        "object": "chat.completion.chunk" if (chat) else "text_completion.chunk",
        "created": time_now,
        "model": "LLaMA_CPP",
        "choices": [
            {
                "finish_reason": None,
                "index": 0
            }
        ]
    }
    if (chat):
        if (start):
            resData["choices"][0]["delta"] = {
                "role": "assistant"
            }
        else:
            resData["choices"][0]["delta"] = {
                "content": data["content"]
            }
            if (data["stop"]):
                resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"
    else:
        resData["choices"][0]["text"] = data["content"]
        if (data["stop"]):
            resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length"

    return resData


@app.route('/chat/completions', methods=['POST'])
@app.route('/v1/chat/completions', methods=['POST'])
def chat_completions():
    if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
        return Response(status=403)
    body = request.get_json()
    stream = False
    tokenize = False
    if(is_present(body, "stream")): stream = body["stream"]
    if(is_present(body, "tokenize")): tokenize = body["tokenize"]
    postData = make_postData(body, chat=True, stream=stream)

    promptToken = []
    if (tokenize):
        tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
        promptToken = tokenData["tokens"]

    if (not stream):
        data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData))
        print(data.json())
        resData = make_resData(data.json(), chat=True, promptToken=promptToken)
        return jsonify(resData)
    else:
        def generate():
            data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
            time_now = int(time.time())
            resData = make_resData_stream({}, chat=True, time_now=time_now, start=True)
            yield 'data: {}\n'.format(json.dumps(resData))
            for line in data.iter_lines():
                if line:
                    decoded_line = line.decode('utf-8')
                    resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now)
                    yield 'data: {}\n'.format(json.dumps(resData))
        return Response(generate(), mimetype='text/event-stream')


@app.route('/completions', methods=['POST'])
@app.route('/v1/completions', methods=['POST'])
def completion():
    if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key):
        return Response(status=403)
    body = request.get_json()
    stream = False
    tokenize = False
    if(is_present(body, "stream")): stream = body["stream"]
    if(is_present(body, "tokenize")): tokenize = body["tokenize"]
    postData = make_postData(body, chat=False, stream=stream)

    promptToken = []
    if (tokenize):
        tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json()
        promptToken = tokenData["tokens"]

    if (not stream):
        data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData))
        print(data.json())
        resData = make_resData(data.json(), chat=False, promptToken=promptToken)
        return jsonify(resData)
    else:
        def generate():
            data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True)
            time_now = int(time.time())
            for line in data.iter_lines():
                if line:
                    decoded_line = line.decode('utf-8')
                    resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now)
                    yield 'data: {}\n'.format(json.dumps(resData))
        return Response(generate(), mimetype='text/event-stream')

if __name__ == '__main__':
    app.run(args.host, port=args.port)
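For orientation, here is a rough sketch (not part of the commit) of the flat prompt that `convert_chat()` above builds from an OpenAI-style message list, using the default `--chat-prompt`, `--system-name`, `--user-name` and `--ai-name` values:

```python
# Illustration only: approximate output of convert_chat() for the default argument values.
messages = [
    {"role": "system", "content": "Answer briefly."},
    {"role": "user",   "content": "Hi!"},
]

# convert_chat(messages) should produce roughly:
expected_prompt = (
    "A chat between a curious user and an artificial intelligence assistant. "
    "The assistant follows the given rules no matter what.\n"
    "\nASSISTANT's RULE: Answer briefly."
    "\nUSER: Hi!"
    "\nASSISTANT:"
)
```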
@@ -158,6 +158,7 @@ struct llama_server_context {
     std::string generated_text;
     std::vector<completion_token_output> generated_token_probs;
 
+    size_t num_prompt_tokens = 0;
     size_t num_tokens_predicted = 0;
     size_t n_past = 0;
     size_t n_remain = 0;
@@ -195,6 +196,7 @@ struct llama_server_context {
 
     void rewind() {
         params.antiprompt.clear();
+        num_prompt_tokens = 0;
         num_tokens_predicted = 0;
         generated_text = "";
         generated_text.reserve(params.n_ctx);
@@ -226,17 +228,18 @@ struct llama_server_context {
     void loadPrompt() {
         params.prompt.insert(0, 1, ' '); // always add a first space
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
+        num_prompt_tokens = prompt_tokens.size();
 
         if (params.n_keep < 0) {
-            params.n_keep = (int)prompt_tokens.size();
+            params.n_keep = (int)num_prompt_tokens;
         }
         params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
 
         // if input prompt is too big, truncate like normal
-        if (prompt_tokens.size() >= (size_t)params.n_ctx) {
+        if (num_prompt_tokens >= (size_t)params.n_ctx) {
             const int n_left = (params.n_ctx - params.n_keep) / 2;
             std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
-            const int erased_blocks = (prompt_tokens.size() - params.n_keep - n_left - 1) / n_left;
+            const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
             new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
             std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
 
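The `loadPrompt()` change above threads `num_prompt_tokens` through the existing truncation logic; as a rough restatement of that arithmetic (a sketch in Python, not the server code):

```python
# Sketch of the prompt-truncation arithmetic used in loadPrompt() above.
def truncate_prompt(prompt_tokens, n_ctx, n_keep):
    num_prompt_tokens = len(prompt_tokens)
    if n_keep < 0:
        n_keep = num_prompt_tokens
    n_keep = min(n_ctx - 4, n_keep)

    if num_prompt_tokens < n_ctx:
        return prompt_tokens                      # prompt fits, keep it unchanged

    n_left = (n_ctx - n_keep) // 2                # room left after the kept prefix, halved
    erased_blocks = (num_prompt_tokens - n_keep - n_left - 1) // n_left
    # keep the first n_keep tokens plus everything after the erased blocks
    return prompt_tokens[:n_keep] + prompt_tokens[n_keep + erased_blocks * n_left:]
```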
@@ -250,7 +253,7 @@ struct llama_server_context {
             truncated = true;
             prompt_tokens = new_tokens;
         } else {
-            const size_t ps = prompt_tokens.size();
+            const size_t ps = num_prompt_tokens;
             std::fill(last_n_tokens.begin(), last_n_tokens.end() - ps, 0);
             std::copy(prompt_tokens.begin(), prompt_tokens.end(), last_n_tokens.end() - ps);
         }
@@ -258,7 +261,7 @@ struct llama_server_context {
         // compare the evaluated prompt with the new prompt
         n_past = common_part(embd, prompt_tokens);
         embd = prompt_tokens;
-        if (n_past == prompt_tokens.size()) {
+        if (n_past == num_prompt_tokens) {
             // we have to evaluate at least 1 token to generate logits.
             n_past--;
         }
@@ -763,6 +766,7 @@ static json format_final_response(llama_server_context & llama, const std::strin
         { "stop", true },
         { "model", llama.params.model_alias },
         { "tokens_predicted", llama.num_tokens_predicted },
+        { "tokens_evaluated", llama.num_prompt_tokens },
         { "generation_settings", format_generation_settings(llama) },
         { "prompt", llama.params.prompt },
         { "truncated", llama.truncated },
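With the new `tokens_evaluated` field, the server's final `/completion` response carries both token counts that `api_like_OAI.py` maps into the OpenAI-style `usage` object; a trimmed, illustrative sketch of the relevant fields (values made up, other fields omitted):

```python
# Illustrative subset of the /completion response after this change.
example_final_response = {
    "content": "...generated text...",
    "stop": True,
    "model": "default",            # llama.params.model_alias
    "tokens_predicted": 64,        # -> usage.completion_tokens in api_like_OAI.py
    "tokens_evaluated": 12,        # new field -> usage.prompt_tokens
    "truncated": False,
}
```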
ggml-cuda.cu: 377 changed lines
@@ -70,9 +70,11 @@ typedef void (*ggml_cuda_op_t)(
 // QK = number of values after dequantization
 // QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
 
 #define QK4_0 32
 #define QR4_0 2
+#define QI4_0 4
 typedef struct {
     half d;                 // delta
     uint8_t qs[QK4_0 / 2];  // nibbles / quants
@@ -81,6 +83,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 
 #define QK4_1 32
 #define QR4_1 2
+#define QI4_1 4
 typedef struct {
     half d;  // delta
     half m;  // min
@@ -90,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong
 
 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
 typedef struct {
     half d;         // delta
     uint8_t qh[4];  // 5-th bit of quants
@@ -99,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 
 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half d;  // delta
     half m;  // min
@@ -109,12 +114,25 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
 
 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 8
 typedef struct {
     half d;            // delta
     int8_t qs[QK8_0];  // quants
 } block_q8_0;
 static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
 
+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 8
+typedef struct {
+    half d;            // delta
+    half s;            // unquantized sum
+    int8_t qs[QK8_0];  // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * vbq, const block_q8_1 * bq8_1, const int iqs);
 
 //================================= k-quants
 
 #ifdef GGML_QKK_64
@@ -198,14 +216,15 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define CUDA_SCALE_BLOCK_SIZE 256
 #define CUDA_ROPE_BLOCK_SIZE 256
 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
 #endif
-#ifndef GGML_CUDA_DMMV_Y
-#define GGML_CUDA_DMMV_Y 1
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
 #endif
 
 #ifndef K_QUANTS_PER_ITERATION
@@ -270,7 +289,6 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -714,7 +732,6 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -819,7 +836,6 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -923,7 +939,6 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1028,7 +1043,6 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1139,7 +1153,6 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * vx, const float
 #endif
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1158,6 +1171,41 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs,
     v.y = x[ib + iqs + 1];
 }
 
+static __global__ void quantize_q8_1(const float * x, void * vy, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i / QK8_0; // block index
+    const int iqs = i % QK8_0; // quant index
+
+    const float xi = x[i];
+    float amax = fabsf(xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32));
+        sum += __shfl_xor_sync(0xffffffff, sum, mask, 32);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : roundf(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    y[ib].d = d;
+    y[ib].s = sum;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
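Besides the per-block scale `d`, `block_q8_1` stores the pre-quantization sum `s` of its 32 values, which the q4_1/q5_1 dot products later use to fold in their offset `m`; a rough NumPy sketch of what `quantize_q8_1` computes per block (illustration only, ignoring the warp-level reductions):

```python
import numpy as np

def quantize_q8_1_block(x):
    """Quantize one block of 32 floats to the q8_1 layout (sketch, not the CUDA kernel)."""
    assert x.size == 32                       # QK8_0 values per block
    amax = np.abs(x).max()
    d = amax / 127.0                          # scale, so quants fit in int8
    q = np.zeros(32, dtype=np.int8) if amax == 0.0 else np.round(x / d).astype(np.int8)
    s = float(x.sum())                        # unquantized sum, reused for the m*s term
    return d, s, q
```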
@@ -1179,6 +1227,182 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }
 
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+
+    const float d = __half2float(bq4_0->d) * __half2float(bq8_1->d);
+
+    // subtract 8 from each quantized value
+    const int vi0 = __vsub4((vi >> 0) & 0x0F0F0F0F, 0x08080808);
+    const int vi1 = __vsub4((vi >> 4) & 0x0F0F0F0F, 0x08080808);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->d);
+    const float m = bq4_1->m;
+    const float s = bq8_1->s;
+
+    const int vi0 = (vi >> 0) & 0x0F0F0F0F;
+    const int vi1 = (vi >> 4) & 0x0F0F0F0F;
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi0, ui0, 0);
+    sumi = __dp4a(vi1, ui1, sumi);
+
+    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int qs;
+    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+
+    const float d = __half2float(bq5_0->d) * __half2float(bq8_1->d);
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0 = __vsub4(vi0, 0x10101010); // subtract 16 from quantized values
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1 = __vsub4(vi1, 0x10101010); // subtract 16 from quantized values
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->d);
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
+    int vi0 = (qs >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0 |= (qh0 << 4) & 0x00000010; // 1 -> 5
+    vi0 |= (qh0 << 11) & 0x00001000; // 2 -> 13
+    vi0 |= (qh0 << 18) & 0x00100000; // 3 -> 21
+    vi0 |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
+
+    int vi1 = (qs >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1 |= (qh1 << 4) & 0x00000010; // 1 -> 5
+    vi1 |= (qh1 << 11) & 0x00001000; // 2 -> 13
+    vi1 |= (qh1 << 18) & 0x00100000; // 3 -> 21
+    vi1 |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
+
+    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+#if __CUDA_ARCH__ >= 600 // lowest compute capability for integer intrinsics
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->d);
+
+    // SIMD dot product of quantized values
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 600
+}
+
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
+    const int row = blockIdx.y*blockDim.y + threadIdx.y;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    // partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t * x = (const block_q_t *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+
+        const int iby = i + threadIdx.x / qi; // y block index
+
+        const int iqs = threadIdx.x % qi; // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
+    }
+
+    if (threadIdx.x == 0) {
+        dst[row] = tmp;
+    }
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows) {
     // qk = quantized weights per x block
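Each `__dp4a` call above multiplies four packed 8-bit values and accumulates them in one instruction; numerically, a single `vec_dot_q4_0_q8_1` invocation reduces to something like the following sketch (plain Python, ignoring the packed-integer layout):

```python
def vec_dot_q4_0_q8_1_ref(q4_nibbles, d4, q8_quants, d8):
    """Reference for one thread's contribution: 8 nibble/int8 pairs (sketch only)."""
    # __vsub4 removes the q4_0 bias of 8; __dp4a sums the products four at a time
    sumi = sum((v - 8) * u for v, u in zip(q4_nibbles, q8_quants))
    return sumi * d4 * d8
```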
@@ -1233,7 +1457,6 @@ static __global__ void dequantize_mul_mat_vec(const void * vx, const dfloat * y,
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1284,7 +1507,6 @@ static __global__ void mul_mat_p021_f16_f32(const void * vx, const float * y, fl
     const int idst = channel*nrows_dst + row_dst;
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1330,7 +1552,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     }
 
     // sum up partial sums and write back result
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1440,7 +1661,6 @@ static __global__ void soft_max_f32(const float * x, float * dst, const int ncol
     }
 
     // sum up partial sums
-    __syncthreads();
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         tmp += __shfl_xor_sync(0xffffffff, tmp, mask, 32);
@@ -1494,6 +1714,11 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con
     rms_norm_f32<<<nrows, block_dims, 0, stream>>>(x, dst, ncols);
 }
 
+static void quantize_row_q8_1_cuda(const float * x, void * vy, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    quantize_q8_1<<<num_blocks, CUDA_QUANTIZE_BLOCK_SIZE, 0, stream>>>(x, vy, k);
+}
+
 static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<QK4_0, QR4_0, dequantize_q4_0><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -1562,45 +1787,45 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu
 
 static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
 static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1647,6 +1872,51 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
     dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
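Each of the new mul_mat_vec_q*_q8_1_cuda launchers sizes its grid the same way: a single block column and ceil(nrows / GGML_CUDA_MMV_Y) block rows, with WARP_SIZE x GGML_CUDA_MMV_Y threads per block. Below is a minimal host-side sketch of that sizing arithmetic only; WARP_SIZE = 32 and GGML_CUDA_MMV_Y = 1 mirror the build defaults, and the concrete numbers are illustrative, not taken from the patch.

#include <cstdio>

// Illustrative defaults; the real values come from the CUDA warp size and
// the LLAMA_CUDA_MMV_Y build option.
constexpr int WARP_SIZE       = 32;
constexpr int GGML_CUDA_MMV_Y = 1;

int main() {
    const int nrows = 4096; // rows of the quantized weight matrix (hypothetical)
    // Ceiling division so every row is covered even when nrows is not a
    // multiple of GGML_CUDA_MMV_Y.
    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
    std::printf("grid = (1, %d, 1), block = (%d, %d, 1)\n",
                block_num_y, WARP_SIZE, GGML_CUDA_MMV_Y);
    return 0;
}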
@@ -1654,9 +1924,9 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c
 
 static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
     GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
-    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
     dequantize_mul_mat_vec<1, 1, convert_f16>
         <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
 }
@@ -1822,6 +2092,7 @@ static size_t g_scratch_offset = 0;
 
 static int g_device_count = -1;
 static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -1839,9 +2110,12 @@ void ggml_init_cublas() {
     for (int id = 0; id < g_device_count; ++id) {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        fprintf(stderr, "  Device %d: %s\n", id, prop.name);
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
 
         g_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
+
+        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
     }
     for (int id = 0; id < g_device_count; ++id) {
         g_tensor_split[id] /= total_vram;
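ggml_init_cublas now also records each device's compute capability as a single integer, 100*prop.major + 10*prop.minor, so later code can gate kernels with a plain >= comparison. A tiny sketch of that encoding follows; the device values are hypothetical.

#include <cstdio>

// Same encoding as the hunk above: major.minor -> 100*major + 10*minor,
// e.g. 6.1 -> 610, 7.0 -> 700, 8.6 -> 860.
static int encode_compute_capability(int major, int minor) {
    return 100*major + 10*minor;
}

int main() {
    std::printf("%d %d %d\n",
                encode_compute_capability(6, 1),
                encode_compute_capability(7, 0),
                encode_compute_capability(8, 6));
    return 0;
}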
@@ -2057,7 +2331,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
+inline void ggml_cuda_op_mul_mat_vec(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
     cudaStream_t & cudaStream_main){
@@ -2069,6 +2343,52 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t ne00 = src0->ne[0];
     const int64_t nrows = i01_high - i01_low;
 
+#ifdef GGML_CUDA_FORCE_DMMV
+    const bool use_mul_mat_vec_q = false;
+#else
+    int id;
+    CUDA_CHECK(cudaGetDevice(&id));
+
+    const bool mul_mat_vec_q_implemented = src0->type == GGML_TYPE_Q4_0 ||
+        src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 ||
+        src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0;
+
+    // The integer intrinsics used in mul_mat_vec_q are available with compute capability 6.
+    // However, they have bad performance with Pascal cards.
+    // Therefore, in a multi GPU setting decide at runtime which GPUs should use mul_mat_vec_q.
+    const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 700 && mul_mat_vec_q_implemented;
+#endif
+
+    if (use_mul_mat_vec_q) {
+        size_t as;
+        void * src1_q8_1 = ggml_cuda_pool_malloc(ne00*sizeof(block_q8_1)/QK8_1, &as);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, cudaStream_main);
+
+        switch (src0->type) {
+            case GGML_TYPE_Q4_0:
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q4_1:
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_0:
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q5_1:
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            case GGML_TYPE_Q8_0:
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+
+        ggml_cuda_pool_free(src1_q8_1, as);
+    } else {
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_DMMV_F16
     size_t ash;
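The dispatch added above reduces to three checks: the GGML_CUDA_FORCE_DMMV build flag, whether the quantization type has a mul_mat_vec_q kernel, and whether the device's recorded compute capability is at least 700 (Volta or newer; the integer intrinsics exist on Pascal but perform poorly there). Below is a condensed, hedged sketch of that decision using stand-in names rather than the real GGML enums.

// Stand-ins for the relevant GGML_TYPE_* values; illustrative only.
enum class qtype { q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, f16 };

// Mirrors the #ifdef / type / capability checks in the hunk above:
// returns true when the quantized (mmvq) path should be taken.
static bool use_mul_mat_vec_q(qtype t, int compute_capability, bool force_dmmv) {
    if (force_dmmv) { // build-time override (GGML_CUDA_FORCE_DMMV)
        return false;
    }
    const bool implemented = t == qtype::q4_0 || t == qtype::q4_1 ||
                             t == qtype::q5_0 || t == qtype::q5_1 ||
                             t == qtype::q8_0;
    return compute_capability >= 700 && implemented; // Volta+; Pascal keeps dmmv
}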
@@ -2132,6 +2452,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
         ggml_cuda_pool_free(src1_dfloat, ash);
     }
 #endif // GGML_CUDA_DMMV_F16
+    }
 
     (void) src1;
     (void) dst;
|
@ -2701,8 +3022,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
|
||||||
}else if (src0->type == GGML_TYPE_F32) {
|
}else if (src0->type == GGML_TYPE_F32) {
|
||||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
||||||
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
|
||||||
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
|
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
|
||||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false, false);
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
|
||||||
} else {
|
} else {
|
||||||
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
|
||||||
}
|
}
|
||||||
|
|

118  ggml.h

@@ -201,6 +201,8 @@
 #define GGML_MAX_NAME 48
 #define GGML_DEFAULT_N_THREADS 4
 
+#define GGML_UNUSED(x) (void)(x)
+
 #define GGML_ASSERT(x) \
     do { \
         if (!(x)) { \
@@ -209,6 +211,30 @@
         } \
     } while (0)
 
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
 #ifdef __cplusplus
 extern "C" {
 #endif
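The GGML_TENSOR_LOCALS macros above expand into four const locals, one per dimension, each wrapped in GGML_UNUSED so that unused ones do not trigger warnings. Roughly, GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) behaves like the hand-written block below; this is a sketch, and ggml_tensor_like is a stand-in for struct ggml_tensor.

#include <cstdint>

struct ggml_tensor_like { int64_t ne[4]; }; // stand-in for struct ggml_tensor

static int64_t num_elements(const ggml_tensor_like * src1) {
    // Hand-expanded equivalent of GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
    const int64_t ne10 = src1->ne[0]; (void) ne10;
    const int64_t ne11 = src1->ne[1]; (void) ne11;
    const int64_t ne12 = src1->ne[2]; (void) ne12;
    const int64_t ne13 = src1->ne[3]; (void) ne13;
    return ne10*ne11*ne12*ne13;
}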
@@ -295,12 +321,15 @@ extern "C" {
         GGML_OP_SUM,
         GGML_OP_SUM_ROWS,
         GGML_OP_MEAN,
+        GGML_OP_ARGMAX,
         GGML_OP_REPEAT,
         GGML_OP_REPEAT_BACK,
         GGML_OP_ABS,
         GGML_OP_SGN,
         GGML_OP_NEG,
         GGML_OP_STEP,
+        GGML_OP_TANH,
+        GGML_OP_ELU,
         GGML_OP_RELU,
         GGML_OP_GELU,
         GGML_OP_GELU_QUICK,
@@ -332,9 +361,8 @@ extern "C" {
         GGML_OP_ROPE_BACK,
         GGML_OP_ALIBI,
         GGML_OP_CLAMP,
-        GGML_OP_CONV_1D_S1_PH,
-        GGML_OP_CONV_1D_S2_PH,
-        GGML_OP_CONV_2D_SK_P0,
+        GGML_OP_CONV_1D,
+        GGML_OP_CONV_2D,
 
         GGML_OP_FLASH_ATTN,
         GGML_OP_FLASH_FF,
@@ -690,6 +718,11 @@ extern "C" {
         struct ggml_context * ctx,
         struct ggml_tensor  * a);
 
+    // argmax along rows
+    GGML_API struct ggml_tensor * ggml_argmax(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
     // if a is the same shape as b, and a is not parameter, return a
     // otherwise, return a new tensor: repeat(a) to fit in b
     GGML_API struct ggml_tensor * ggml_repeat(
@@ -734,6 +767,22 @@ extern "C" {
         struct ggml_context * ctx,
         struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_tanh(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
     GGML_API struct ggml_tensor * ggml_relu(
         struct ggml_context * ctx,
         struct ggml_tensor  * a);
@@ -1084,58 +1133,33 @@ extern "C" {
         float                 min,
         float                 max);
 
-    // TODO: implement general-purpose convolutions
-    // GGML_API struct ggml_tensor * ggml_conv_1d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0
-    //        int                   p0,
-    //        int                   d0);
-    //
-    // GGML_API struct ggml_tensor * ggml_conv_2d(
-    //        struct ggml_context * ctx,
-    //        struct ggml_tensor  * a,
-    //        struct ggml_tensor  * b,
-    //        int                   s0,
-    //        int                   s1,
-    //        int                   p0,
-    //        int                   p1,
-    //        int                   d0,
-    //        int                   d1);
-
-    // padding = half
-    // TODO: we don't support extra parameters for now
-    //       that's why we are hard-coding the stride, padding, and dilation
-    //       not great ..
-    // example:
-    // a:      3   80  768    1
-    // b:   3000   80    1    1
-    // res: 3000  768    1    1
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s1_ph(
+    GGML_API struct ggml_tensor * ggml_conv_1d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
+        struct ggml_tensor  * b,
+        int                   s0,  // stride
+        int                   p0,  // padding
+        int                   d0); // dilation
 
-    // used in whisper
-    GGML_API struct ggml_tensor * ggml_conv_1d_s2_ph(
+    GGML_API struct ggml_tensor * ggml_conv_2d(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1);
 
-    // kernel size is a->ne[0] x a->ne[1]
-    // stride is equal to kernel size
-    // padding is zero
-    // example:
-    // a:     16   16    3  768
-    // b:   1024 1024    3    1
-    // res:   64   64  768    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
         struct ggml_context * ctx,
         struct ggml_tensor  * a,
-        struct ggml_tensor  * b);
+        struct ggml_tensor  * b,
+        int                   s,
+        int                   d);
 
     GGML_API struct ggml_tensor * ggml_flash_attn(
         struct ggml_context * ctx,
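With this change the hard-coded whisper/sam helpers are gone and callers pass stride, padding, and dilation explicitly; ggml_conv_1d_ph is the convenience wrapper for the old "padding = half" behaviour. The sketch below shows, under those declarations, how a former ggml_conv_1d_s1_ph(ctx, a, b) call can be expressed; the wrapper name conv_1d_s1_ph_compat is made up for illustration.

#include "ggml.h"

// "Stride 1, padding = half" expressed with the generalized API:
// s0 = 1, p0 = a->ne[0]/2, d0 = 1. Per the alias comment in the header,
// ggml_conv_1d_ph(ctx, a, b, 1, 1) should be equivalent.
static struct ggml_tensor * conv_1d_s1_ph_compat(struct ggml_context * ctx,
                                                 struct ggml_tensor  * a,   // kernel
                                                 struct ggml_tensor  * b) { // data
    return ggml_conv_1d(ctx, a, b, /*s0=*/1, /*p0=*/a->ne[0]/2, /*d0=*/1);
}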

35  llama.cpp

@@ -1156,6 +1156,7 @@ static void llama_model_load_internal(
         }
     }
 #endif // GGML_USE_CUBLAS
 
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -1164,6 +1165,10 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
     }
     size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+    const int max_backend_supported_layers = hparams.n_layer + 3;
+    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
     if (n_gpu_layers > (int) hparams.n_layer + 1) {
         if (low_vram) {
             fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1180,14 +1185,18 @@ static void llama_model_load_internal(
             vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
         }
     }
-    const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
 
     fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-            __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+            __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
     fprintf(stderr, "%s: total VRAM used: %zu MB\n",
             __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
     (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     }
 
     // populate `tensors_by_name`
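The reporting change above separates how many layers the backend can accept at all (n_layer + 3 under cuBLAS, the extra slots covering the non-repeating tensors and the KV cache) from how many are actually offloadable when the low-VRAM option is set (n_layer + 1). A small worked example with hypothetical numbers:

#include <algorithm>
#include <cstdio>

int main() {
    const int  n_layer      = 32;   // hypothetical model depth
    const int  n_gpu_layers = 40;   // user requested "everything"
    const bool low_vram     = false;

    // Mirrors the cuBLAS branch above.
    const int max_backend_supported_layers = n_layer + 3;
    const int max_offloadable_layers       = low_vram ? n_layer + 1 : n_layer + 3;

    std::printf("offloaded %d/%d layers to GPU\n",
                std::min(n_gpu_layers, max_offloadable_layers),
                max_backend_supported_layers); // -> "offloaded 35/35 layers to GPU"
    return 0;
}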
@@ -1896,10 +1905,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(ctx, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -1928,9 +1937,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();
 
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
@@ -1982,11 +1990,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
@@ -2155,13 +2163,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
     }
     return X;
 }
 
 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();
 
@@ -2176,13 +2182,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         candidates->size = 1;
     }
 
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
     // Sample the next word X from the remaining words
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
@@ -1,6 +1,11 @@
 #!/bin/bash
 
 cp -rpv ../ggml/src/ggml.c           ./ggml.c
-cp -rpv ../ggml/src/ggml-cuda.cu     ./ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-cuda.h      ./ggml-cuda.h
+cp -rpv ../ggml/src/ggml-cuda.cu     ./ggml-cuda.cu
+cp -rpv ../ggml/src/ggml-opencl.h    ./ggml-opencl.h
+cp -rpv ../ggml/src/ggml-opencl.cpp  ./ggml-opencl.cpp
+cp -rpv ../ggml/src/ggml-metal.h     ./ggml-metal.h
+cp -rpv ../ggml/src/ggml-metal.m     ./ggml-metal.m
+cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal
 cp -rpv ../ggml/include/ggml/ggml.h  ./ggml.h