Merge branch 'ggerganov:master' into iq2_s

This commit is contained in:
Abhilash Majumder 2024-03-18 19:31:56 +05:30 committed by GitHub
commit 1641c521fe
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
33 changed files with 1821 additions and 1507 deletions

View file

@ -48,6 +48,28 @@ jobs:
CC=gcc-8 make tests -j $(nproc)
make test -j $(nproc)
ubuntu-focal-make-curl:
runs-on: ubuntu-20.04
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
LLAMA_CURL: 1
run: |
CC=gcc-8 make -j $(nproc)
ubuntu-latest-cmake:
runs-on: ubuntu-latest
@ -76,40 +98,40 @@ jobs:
cd build
ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-sanitizer:
runs-on: ubuntu-latest
continue-on-error: true
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
id: cmake_test
run: |
cd build
ctest -L main --verbose --timeout 900
# ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
# continue-on-error: true
#
# strategy:
# matrix:
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
# build_type: [Debug, Release]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v3
#
# - name: Dependencies
# id: depends
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
#
# - name: Build
# id: cmake_build
# run: |
# mkdir build
# cd build
# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
# cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
#
# - name: Test
# id: cmake_test
# run: |
# cd build
# ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest

View file

@ -15,8 +15,8 @@ jobs:
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"
stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
operations-per-run: 1000
repo-token: ${{ secrets.GITHUB_TOKEN }}

View file

@ -24,13 +24,13 @@ jobs:
strategy:
matrix:
sanitizer: [ADDRESS, THREAD, UNDEFINED]
# TODO: temporary disabled due to linux kernel issues
#sanitizer: [ADDRESS, THREAD, UNDEFINED]
sanitizer: [UNDEFINED]
build_type: [Debug]
include:
- build_type: Release
sanitizer: ""
- build_type: Debug
sanitizer: THREAD
disabled_on_pr: true
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
@ -57,7 +57,8 @@ jobs:
cmake \
python3-pip \
wget \
language-pack-en
language-pack-en \
libcurl4-openssl-dev
- name: Build
id: cmake_build
@ -67,6 +68,7 @@ jobs:
cmake .. \
-DLLAMA_NATIVE=OFF \
-DLLAMA_BUILD_SERVER=ON \
-DLLAMA_CURL=ON \
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
@ -101,12 +103,21 @@ jobs:
with:
fetch-depth: 0
- name: libCURL
id: get_libcurl
env:
CURL_VERSION: 8.6.0_6
run: |
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
- name: Python setup
@ -120,6 +131,11 @@ jobs:
run: |
pip install -r examples/server/tests/requirements.txt
- name: Copy Libcurl
id: prepare_libcurl
run: |
cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
- name: Tests
id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}

View file

@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)

View file

@ -595,6 +595,11 @@ include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
endif
ifdef LLAMA_CURL
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
override LDFLAGS := $(LDFLAGS) -lcurl
endif
#
# Print build information
#

View file

@ -68,6 +68,17 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()
set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)

File diff suppressed because it is too large Load diff

View file

@ -89,6 +89,7 @@ struct gpt_params {
struct llama_sampling_params sparams;
std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_url = ""; // model url to download
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
@ -191,6 +192,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
struct llama_model_params params);
// Batch utils
void llama_batch_clear(struct llama_batch & batch);

View file

@ -1634,7 +1634,7 @@ in chat mode so that the conversation can end normally.")
self.post_write_tensors(tensor_map, name, data_torch)
@Model.register("BertModel")
@Model.register("BertModel", "CamembertModel")
class BertModel(Model):
model_arch = gguf.MODEL_ARCH.BERT

View file

@ -1167,9 +1167,9 @@ class OutputFile:
def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
return GGMLFileType.AllF32
if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
return GGMLFileType.MostlyF16
if output_type_str == "q8_0":
return GGMLFileType.MostlyQ8_0

View file

@ -56,13 +56,31 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];
std::string wname;
{
// remove any prefix and suffixes from the name
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
const char * p = strchr(src0->name, '#');
if (p != NULL) {
p = p + 1;
const char * q = strchr(p, '#');
if (q != NULL) {
wname = std::string(p, q - p);
} else {
wname = p;
}
} else {
wname = src0->name;
}
}
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
if (ask) {
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
if (t->op != GGML_OP_MUL_MAT) return false;
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
return true;
}
@ -94,12 +112,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
// this is necessary to guarantee equal number of "ncall" for each tensor
for (int ex = 0; ex < n_as; ++ex) {
src0 = t->src[2 + ex];
auto& e = m_stats[src0->name];
auto& e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
}
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
@ -107,7 +125,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const int excur = m_ids[row*n_as + idx];
@ -129,17 +147,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
}
}
} else {
auto& e = m_stats[src0->name];
auto& e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
exit(1); //GGML_ASSERT(false);
}
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
for (int row = 0; row < (int)src1->ne[1]; ++row) {
const float * x = data + row * src1->ne[0];

View file

@ -114,10 +114,10 @@ static std::string get_cpu_info() {
static std::string get_gpu_info() {
std::string id;
#ifdef GGML_USE_CUBLAS
int count = ggml_cuda_get_device_count();
int count = ggml_backend_cuda_get_device_count();
for (int i = 0; i < count; i++) {
char buf[128];
ggml_cuda_get_device_description(i, buf, sizeof(buf));
ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
id += buf;
if (i < count - 1) {
id += "/";

View file

@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.

View file

@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.

View file

@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
}
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: %s)\n", params.model_url.c_str());
printf(" -a ALIAS, --alias ALIAS\n");
printf(" set an alias for the model, will be added as `model` field in completion response\n");
printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
break;
}
params.model = argv[i];
} else if (arg == "-mu" || arg == "--model-url") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_url = argv[i];
} else if (arg == "-a" || arg == "--alias") {
if (++i >= argc) {
invalid_param = true;

View file

@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de
To run a scenario annotated with `@bug`, start:
```shell
DEBUG=ON ./tests.sh --no-skipped --tags bug
DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
```
After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.

View file

@ -4,7 +4,8 @@ Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
And a model file ggml-model-f16.gguf
And a model alias bert-bge-small
And 42 as server seed
And 2 slots

View file

@ -1,10 +1,12 @@
import errno
import os
import socket
import subprocess
import time
from contextlib import closing
import signal
import socket
import sys
import time
import traceback
from contextlib import closing
import psutil
def before_scenario(context, scenario):
@ -20,33 +22,40 @@ def before_scenario(context, scenario):
def after_scenario(context, scenario):
if context.server_process is None:
return
if scenario.status == "failed":
if 'GITHUB_ACTIONS' in os.environ:
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
if os.path.isfile('llama.log'):
with closing(open('llama.log', 'r')) as f:
for line in f:
print(line)
if not is_server_listening(context.server_fqdn, context.server_port):
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
try:
if 'server_process' not in context or context.server_process is None:
return
if scenario.status == "failed":
if 'GITHUB_ACTIONS' in os.environ:
print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n")
if os.path.isfile('llama.log'):
with closing(open('llama.log', 'r')) as f:
for line in f:
print(line)
if not is_server_listening(context.server_fqdn, context.server_port):
print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n")
if not pid_exists(context.server_process.pid):
assert False, f"Server not running pid={context.server_process.pid} ..."
if not pid_exists(context.server_process.pid):
assert False, f"Server not running pid={context.server_process.pid} ..."
server_graceful_shutdown(context)
server_graceful_shutdown(context)
# Wait few for socket to free up
time.sleep(0.05)
# Wait few for socket to free up
time.sleep(0.05)
attempts = 0
while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
server_kill(context)
time.sleep(0.1)
attempts += 1
if attempts > 5:
server_kill_hard(context)
attempts = 0
while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port):
server_kill(context)
time.sleep(0.1)
attempts += 1
if attempts > 5:
server_kill_hard(context)
except:
exc = sys.exception()
print("error in after scenario: \n")
print(exc)
print("*** print_tb: \n")
traceback.print_tb(exc.__traceback__, file=sys.stdout)
def server_graceful_shutdown(context):
@ -67,11 +76,11 @@ def server_kill_hard(context):
path = context.server_path
print(f"Server dangling exits, hard killing force {pid}={path}...\n")
if os.name == 'nt':
process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
print(process)
else:
os.kill(-pid, signal.SIGKILL)
try:
psutil.Process(pid).kill()
except psutil.NoSuchProcess:
return False
return True
def is_server_listening(server_fqdn, server_port):
@ -84,17 +93,9 @@ def is_server_listening(server_fqdn, server_port):
def pid_exists(pid):
"""Check whether pid exists in the current process table."""
if pid < 0:
try:
psutil.Process(pid)
except psutil.NoSuchProcess:
return False
if os.name == 'nt':
output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
print(output)
return "No tasks are running" not in output
else:
try:
os.kill(pid, 0)
except OSError as e:
return e.errno == errno.EPERM
else:
return True
return True

View file

@ -4,7 +4,8 @@ Feature: llama.cpp server
Background: Server startup
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
And a model file stories260K.gguf
And a model alias tinyllama-2
And 42 as server seed
# KV Cache corresponds to the total amount of tokens

View file

@ -5,6 +5,8 @@ import os
import re
import socket
import subprocess
import sys
import threading
import time
from contextlib import closing
from re import RegexFlag
@ -32,6 +34,8 @@ def step_server_config(context, server_fqdn, server_port):
context.base_url = f'http://{context.server_fqdn}:{context.server_port}'
context.model_alias = None
context.model_file = None
context.model_url = None
context.n_batch = None
context.n_ubatch = None
context.n_ctx = None
@ -65,6 +69,16 @@ def step_download_hf_model(context, hf_file, hf_repo):
print(f"model file: {context.model_file}\n")
@step('a model file {model_file}')
def step_model_file(context, model_file):
context.model_file = model_file
@step('a model url {model_url}')
def step_model_url(context, model_url):
context.model_url = model_url
@step('a model alias {model_alias}')
def step_model_alias(context, model_alias):
context.model_alias = model_alias
@ -141,7 +155,8 @@ def step_start_server(context):
async def step_wait_for_the_server_to_be_started(context, expecting_status):
match expecting_status:
case 'healthy':
await wait_for_health_status(context, context.base_url, 200, 'ok')
await wait_for_health_status(context, context.base_url, 200, 'ok',
timeout=30)
case 'ready' | 'idle':
await wait_for_health_status(context, context.base_url, 200, 'ok',
@ -1038,8 +1053,11 @@ def start_server_background(context):
server_args = [
'--host', server_listen_addr,
'--port', context.server_port,
'--model', context.model_file
]
if context.model_file:
server_args.extend(['--model', context.model_file])
if context.model_url:
server_args.extend(['--model-url', context.model_url])
if context.n_batch:
server_args.extend(['--batch-size', context.n_batch])
if context.n_ubatch:
@ -1079,8 +1097,23 @@ def start_server_background(context):
pkwargs = {
'creationflags': flags,
'stdout': subprocess.PIPE,
'stderr': subprocess.PIPE
}
context.server_process = subprocess.Popen(
[str(arg) for arg in [context.server_path, *server_args]],
**pkwargs)
def log_stdout(process):
for line in iter(process.stdout.readline, b''):
print(line.decode('utf-8'), end='')
thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
thread_stdout.start()
def log_stderr(process):
for line in iter(process.stderr.readline, b''):
print(line.decode('utf-8'), end='', file=sys.stderr)
thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
thread_stderr.start()
print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")

View file

@ -3,4 +3,5 @@ behave~=1.2.6
huggingface_hub~=0.20.3
numpy~=1.24.4
openai~=0.25.0
psutil~=5.9.8
prometheus-client~=0.20.0

View file

@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view(node)) {
// TODO: better way to add external dependencies
// GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
// control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
// itself is never used and should not be considered a dependency
if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
struct ggml_tensor * view_src = node->view_src;
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
ggml_gallocr_hash_get(galloc, src)->n_children += 1;
// allocate explicit inputs and leafs
if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
// allocate explicit inputs
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
}
}

View file

@ -103,6 +103,11 @@ extern "C" {
// check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);

View file

@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
return err;
}
bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return backend->iface.graph_compute(backend, cgraph);
}
@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
return backend->iface.supports_op(backend, op);
}
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
if (backend->iface.offload_op != NULL) {
return backend->iface.offload_op(backend, op);
}
return false;
}
// backend copy
static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
if (cpu_plan->cplan.work_size > 0) {
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
if (cpu_plan->cplan.work_data == NULL) {
free(cpu_plan);
return NULL;
}
}
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .supports_op = */ ggml_backend_cpu_supports_op,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,
@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
#endif
#ifndef GGML_SCHED_MAX_SPLITS
#define GGML_SCHED_MAX_SPLITS 256
#define GGML_SCHED_MAX_SPLITS 2048
#endif
#ifndef GGML_SCHED_MAX_SPLIT_INPUTS
#define GGML_SCHED_MAX_SPLIT_INPUTS 16
#define GGML_SCHED_MAX_SPLIT_INPUTS 4
#endif
#ifndef GGML_SCHED_MAX_COPIES
@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
struct ggml_cgraph * graph;
// graph splits
struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
struct ggml_backend_sched_split * splits;
int n_splits;
int splits_capacity;
// pipeline parallelism support
int n_copies;
@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
// TODO: use supports_op to check if the backend supports the op
// assign pre-allocated nodes to their backend
// dst
int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
if (cur_backend != -1) {
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
if (cur_backend_id != -1) {
SET_CAUSE(tensor, "1.dst");
return cur_backend;
return cur_backend_id;
}
// view_src
if (tensor->view_src != NULL) {
cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
if (cur_backend != -1) {
cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
if (cur_backend_id != -1) {
SET_CAUSE(tensor, "1.vsrc");
return cur_backend;
return cur_backend_id;
}
}
// input
// graph input
if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
SET_CAUSE(tensor, "1.inp");
return cur_backend;
return cur_backend_id;
}
// assign nodes that use weights to the backend of the weights
// operations with weights are preferably run on the same backend as the weights
for (int i = 0; i < GGML_MAX_SRC; i++) {
const struct ggml_tensor * src = tensor->src[i];
if (src == NULL) {
continue;
}
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
// operations with weights are always run on the same backend as the weights
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
// check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
return b;
}
}
}
SET_CAUSE(tensor, "1.wgt%d", i);
return src_backend;
return src_backend_id;
}
}
@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// pass 1: assign backends to ops with pre-allocated inputs
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
if (tensor_backend_id(leaf) != -1) {
int * leaf_backend_id = &tensor_backend_id(leaf);
if (*leaf_backend_id != -1) {
// do not overwrite user assignments
continue;
}
tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
*leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
}
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (tensor_backend_id(node) != -1) {
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
// do not overwrite user assignments
continue;
}
tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
*node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
// src
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
if (tensor_backend_id(src) == -1) {
tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
int * src_backend_id = &tensor_backend_id(src);
if (*src_backend_id == -1) {
*src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
}
}
}
@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (ggml_is_view_op(node->op)) {
continue;
}
int tensor_backend_id = tensor_backend_id(node);
if (tensor_backend_id != -1) {
if (tensor_backend_id == sched->n_backends - 1) {
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
if (*node_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
cur_backend_id = -1;
} else {
cur_backend_id = tensor_backend_id;
cur_backend_id = *node_backend_id;
}
} else {
tensor_backend_id(node) = cur_backend_id;
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.2");
}
}
}
// pass 2.1 expand gpu up
{
int cur_backend_id = -1;
@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (ggml_is_view_op(node->op)) {
continue;
}
int tensor_backend_id = tensor_backend_id(node);
if (tensor_backend_id != -1) {
if (tensor_backend_id == sched->n_backends - 1) {
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
if (*node_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
cur_backend_id = -1;
} else {
cur_backend_id = tensor_backend_id;
cur_backend_id = *node_backend_id;
}
} else {
tensor_backend_id(node) = cur_backend_id;
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.1");
}
}
}
// pass 2.4 expand rest down
{
int cur_backend_id = -1;
@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (ggml_is_view_op(node->op)) {
continue;
}
int tensor_backend_id = tensor_backend_id(node);
if (tensor_backend_id != -1) {
cur_backend_id = tensor_backend_id;
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
cur_backend_id = *node_backend_id;
} else {
tensor_backend_id(node) = cur_backend_id;
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.4");
}
}
}
// pass 2.3 expand rest up
// pass 2.3 expand rest up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (ggml_is_view_op(node->op)) {
continue;
}
int tensor_backend_id = tensor_backend_id(node);
if (tensor_backend_id != -1) {
cur_backend_id = tensor_backend_id;
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
cur_backend_id = *node_backend_id;
} else {
tensor_backend_id(node) = cur_backend_id;
*node_backend_id = cur_backend_id;
SET_CAUSE(node, "2.3");
}
}
@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// pass 3: assign backends to remaining src from dst and view_src
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
int cur_backend_id = tensor_backend_id(node);
if (node->view_src != NULL && cur_backend_id == -1) {
cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
int * cur_backend_id = &tensor_backend_id(node);
if (node->view_src != NULL && *cur_backend_id == -1) {
*cur_backend_id = tensor_backend_id(node->view_src);
SET_CAUSE(node, "3.vsrc");
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
if (src == NULL) {
continue;
}
int src_backend_id = tensor_backend_id(src);
if (src_backend_id == -1) {
int * src_backend_id = &tensor_backend_id(src);
if (*src_backend_id == -1) {
if (src->view_src != NULL) {
// views are always on the same backend as the source
tensor_backend_id(src) = tensor_backend_id(src->view_src);
*src_backend_id = tensor_backend_id(src->view_src);
SET_CAUSE(src, "3.vsrc");
} else {
tensor_backend_id(src) = cur_backend_id;
*src_backend_id = *cur_backend_id;
SET_CAUSE(src, "3.cur");
}
}
@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
// pass 4: split graph, find tensors that need to be copied
{
int cur_split = 0;
int i_split = 0;
struct ggml_backend_sched_split * split = &sched->splits[0];
// find the backend of the first split, skipping view ops
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (!ggml_is_view_op(node->op)) {
sched->splits[0].backend_id = tensor_backend_id(node);
split->backend_id = tensor_backend_id(node);
break;
}
}
sched->splits[0].i_start = 0;
sched->splits[0].n_inputs = 0;
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
int cur_backend_id = sched->splits[0].backend_id;
split->i_start = 0;
split->n_inputs = 0;
memset(split->inputs, 0, sizeof(split->inputs)); //HACK
int cur_backend_id = split->backend_id;
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
continue;
}
int tensor_backend_id = tensor_backend_id(node);
const int node_backend_id = tensor_backend_id(node);
GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now
if (tensor_backend_id != cur_backend_id) {
sched->splits[cur_split].i_end = i;
cur_split++;
GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
sched->splits[cur_split].backend_id = tensor_backend_id;
sched->splits[cur_split].i_start = i;
sched->splits[cur_split].n_inputs = 0;
cur_backend_id = tensor_backend_id;
// check if we should start a new split based on the sources of the current node
bool need_new_split = false;
if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
// check if a weight is on a different backend
// by starting a new split, the memory of the previously offloaded weights can be reused
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = tensor_backend_id(src);
if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
need_new_split = true;
break;
}
}
// check if the split has too many inputs
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
const size_t id = hash_id(src);
int src_backend_id = sched->tensor_backend_id[id];
if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true;
break;
}
}
}
}
if (node_backend_id != cur_backend_id || need_new_split) {
split->i_end = i;
i_split++;
if (i_split >= sched->splits_capacity) {
sched->splits_capacity *= 2;
sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
GGML_ASSERT(sched->splits != NULL);
}
GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
split = &sched->splits[i_split];
split->backend_id = node_backend_id;
split->i_start = i;
split->n_inputs = 0;
cur_backend_id = node_backend_id;
}
// find inputs that are not on the same backend
@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
continue;
}
int src_backend_id = tensor_backend_id(src);
const int src_backend_id = tensor_backend_id(src);
assert(src_backend_id != -1); // all inputs should be assigned by now
if (src->flags & GGML_TENSOR_FLAG_INPUT) {
if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
size_t id = hash_id(src);
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[src_backend_id];
@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
tensor_backend_id(tensor_copy) = src_backend_id;
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_graph_inputs = sched->n_graph_inputs++;
@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
}
if (src_backend_id != tensor_backend_id) {
if (src_backend_id != node_backend_id) {
// create a copy of the input in the split's backend
size_t id = hash_id(src);
const size_t id = hash_id(src);
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[cur_backend_id];
for (int c = 0; c < sched->n_copies; c++) {
@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
}
sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
tensor_backend_id(tensor_copy) = cur_backend_id;
SET_CAUSE(tensor_copy, "4.cpy");
}
int n_inputs = sched->splits[cur_split].n_inputs++;
int n_inputs = split->n_inputs++;
GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src;
split->inputs[n_inputs] = src;
}
node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
}
}
}
sched->splits[cur_split].i_end = graph->n_nodes;
sched->n_splits = cur_split + 1;
split->i_end = graph->n_nodes;
sched->n_splits = i_split + 1;
}
#ifdef DEBUG_PASS4
fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
#endif
#ifndef NDEBUG
// sanity check: all sources should have the same backend as the node
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
if (tensor_backend == NULL) {
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
}
if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
continue;
}
ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
if (src_backend != tensor_backend /* && src_backend != NULL */) {
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
}
if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
}
}
}
fflush(stderr);
#endif
// create copies of the graph for each split
// TODO: avoid this copy
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
// add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
for (int j = 0; j < split->n_inputs; j++) {
assert(graph_copy->size > (graph_copy->n_nodes + 1));
struct ggml_tensor * input = split->inputs[j];
struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
const size_t input_id = hash_id(input);
struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];
// add a dependency to the input source so that it is not freed before the copy is done
struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
input_dep->src[0] = input;
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
// add a dependency to the input copy so that it is allocated at the start of the split
@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
}
for (int j = split->i_start; j < split->i_end; j++) {
assert(graph_copy->size > graph_copy->n_nodes);
sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
}
@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
}
ggml_backend_tensor_copy(input, input_cpy);
} else {
// wait for the split backend to finish using the input before overwriting it
if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
} else {
ggml_backend_synchronize(split_backend);
ggml_backend_synchronize(input_backend);
}
ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
}
}
@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
// initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
sched->hash_set = ggml_hash_set_new(graph_size);
sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
const int initial_splits_capacity = 16;
sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
sched->splits_capacity = initial_splits_capacity;
for (int b = 0; b < n_backends; b++) {
sched->backends[b] = backends[b];
@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
}
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
free(sched->splits);
free(sched->hash_set.keys);
free(sched->tensor_backend_id);
free(sched->tensor_copies);
@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
}
bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
ggml_backend_sched_split_graph(sched, measure_graph);
// TODO: extract this to a separate function
@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
}
bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);
ggml_backend_sched_split_graph(sched, graph);

View file

@ -70,11 +70,11 @@ extern "C" {
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

View file

@ -82,6 +82,10 @@
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaHostRegister hipHostRegister
#define cudaHostRegisterPortable hipHostRegisterPortable
#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
#define cudaHostUnregister hipHostUnregister
#define cudaLaunchHostFunc hipLaunchHostFunc
#ifdef GGML_HIP_UMA
#define cudaMalloc hipMallocManaged
@ -7787,11 +7791,7 @@ struct cuda_pool_alloc {
static bool g_cublas_loaded = false;
GGML_CALL bool ggml_cublas_loaded(void) {
return g_cublas_loaded;
}
GGML_CALL void ggml_init_cublas() {
static void ggml_init_cublas() {
static bool initialized = false;
if (!initialized) {
@ -7880,7 +7880,7 @@ GGML_CALL void ggml_init_cublas() {
}
}
GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
static void * ggml_cuda_host_malloc(size_t size) {
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
return nullptr;
}
@ -7890,7 +7890,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size/1024.0/1024.0, cudaGetErrorString(err));
return nullptr;
}
@ -7898,7 +7898,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
return ptr;
}
GGML_CALL void ggml_cuda_host_free(void * ptr) {
static void ggml_cuda_host_free(void * ptr) {
CUDA_CHECK(cudaFreeHost(ptr));
}
@ -9036,21 +9036,13 @@ static void ggml_cuda_op_soft_max(
// positions tensor
float * src2_dd = nullptr;
cuda_pool_alloc<float> src2_f;
ggml_tensor * src2 = dst->src[2];
const bool use_src2 = src2 != nullptr;
if (use_src2) {
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
if (src2_on_device) {
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
src2_dd = (float *) src2_extra->data_device[g_main_device];
} else {
src2_dd = src2_f.alloc(ggml_nelements(src2));
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
}
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
src2_dd = (float *) src2_extra->data_device[g_main_device];
}
soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
@ -9107,55 +9099,24 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
// dd = data device
float * src0_ddf = nullptr;
float * src1_ddf = nullptr;
float * dst_ddf = nullptr;
cuda_pool_alloc<float> src0_f;
cuda_pool_alloc<float> src1_f;
cuda_pool_alloc<float> dst_f;
ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
if (src0_on_device) {
src0_ddf = (float *) src0_extra->data_device[g_main_device];
} else {
src0_ddf = src0_f.alloc(ggml_nelements(src0));
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
}
src0_ddf = (float *) src0_extra->data_device[g_main_device];
if (use_src1) {
if (src1_on_device) {
src1_ddf = (float *) src1_extra->data_device[g_main_device];
} else {
src1_ddf = src1_f.alloc(ggml_nelements(src1));
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
}
}
if (dst_on_device) {
dst_ddf = (float *) dst_extra->data_device[g_main_device];
} else {
dst_ddf = dst_f.alloc(ggml_nelements(dst));
src1_ddf = (float *) src1_extra->data_device[g_main_device];
}
dst_ddf = (float *) dst_extra->data_device[g_main_device];
// do the computation
op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
CUDA_CHECK(cudaGetLastError());
// copy dst to host if necessary
if (!dst_on_device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
}
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
}
}
static void ggml_cuda_set_peer_access(const int n_tokens) {
@ -9251,7 +9212,6 @@ static void ggml_cuda_op_mul_mat(
ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
const bool src0_is_contiguous = ggml_is_contiguous(src0);
const bool src1_is_contiguous = ggml_is_contiguous(src1);
@ -9322,13 +9282,13 @@ static void ggml_cuda_op_mul_mat(
used_devices++;
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
const bool src1_on_device = id == g_main_device; // TODO: check from buffer
const bool dst_on_device = id == g_main_device;
ggml_cuda_set_device(id);
cudaStream_t stream = g_cudaStreams[id][0];
if (src0_on_device && src0_is_contiguous) {
if (src0_is_contiguous) {
dev[id].src0_dd = (char *) src0_extra->data_device[id];
} else {
dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
@ -9374,8 +9334,8 @@ static void ggml_cuda_op_mul_mat(
continue;
}
const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
const bool src1_on_device = id == g_main_device; // TODO: check from buffer
const bool dst_on_device = id == g_main_device;
const int64_t row_diff = dev[id].row_high - dev[id].row_low;
ggml_cuda_set_device(id);
@ -9400,12 +9360,12 @@ static void ggml_cuda_op_mul_mat(
// the main device memory buffer can be on VRAM scratch, with space for all partial results
// in that case an offset on dst_ddf_i is needed
if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
if (id == g_main_device) {
dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
}
// copy src0, src1 to device if necessary
if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
if (src1_is_contiguous) {
if (id != g_main_device) {
if (convert_src1_to_q8_1) {
char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
@ -9418,19 +9378,19 @@ static void ggml_cuda_op_mul_mat(
src1_ncols*ne10*sizeof(float), stream));
}
}
} else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
} else if (src1_on_device && !src1_is_contiguous) {
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
} else {
GGML_ASSERT(false);
}
if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
if (convert_src1_to_q8_1 && !src1_is_contiguous) {
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
CUDA_CHECK(cudaGetLastError());
}
if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
}
@ -9441,17 +9401,7 @@ static void ggml_cuda_op_mul_mat(
// copy dst to host or other device if necessary
if (!dst_on_device) {
void * dst_off_device;
cudaMemcpyKind kind;
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
dst_off_device = dst->data;
kind = cudaMemcpyDeviceToHost;
} else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
dst_off_device = dst_extra->data_device[g_main_device];
kind = cudaMemcpyDeviceToDevice;
} else {
GGML_ASSERT(false);
}
void * dst_off_device = dst_extra->data_device[g_main_device];
if (split) {
// src0 = weight matrix is saved as a transposed matrix for better memory layout.
// dst is NOT transposed.
@ -9462,28 +9412,26 @@ static void ggml_cuda_op_mul_mat(
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
#if !defined(GGML_USE_HIPBLAS)
if (kind == cudaMemcpyDeviceToDevice) {
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = g_main_device;
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
p.srcDevice = id;
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
} else
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
cudaMemcpy3DPeerParms p = {};
p.dstDevice = g_main_device;
p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols);
p.srcDevice = id;
p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
#else
// HIP does not support cudaMemcpy3DPeerAsync or vmm pools
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
dst_dd_i, row_diff*sizeof(float),
row_diff*sizeof(float), src1_ncols,
cudaMemcpyDeviceToDevice, stream));
#endif
{
CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
dst_dd_i, row_diff*sizeof(float),
row_diff*sizeof(float), src1_ncols,
kind, stream));
}
} else {
float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
dhf_dst_i += src1_col_0*ne0;
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
}
}
@ -9510,11 +9458,6 @@ static void ggml_cuda_op_mul_mat(
}
}
}
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
ggml_cuda_set_device(g_main_device);
CUDA_CHECK(cudaDeviceSynchronize());
}
}
static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -9599,36 +9542,19 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
// dd = data device
float * src0_ddf = nullptr;
float * src1_ddf = nullptr;
float * dst_ddf = nullptr;
cuda_pool_alloc<float> dst_f;
ggml_cuda_set_device(g_main_device);
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
if (dst_on_device) {
dst_ddf = (float *) dst_extra->data_device[g_main_device];
} else {
dst_ddf = dst_f.alloc(ggml_nelements(dst));
}
dst_ddf = (float *) dst_extra->data_device[g_main_device];
// do the computation
ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
CUDA_CHECK(cudaGetLastError());
// copy dst to host if necessary
if (!dst_on_device) {
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
}
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
CUDA_CHECK(cudaDeviceSynchronize());
}
}
static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -9639,21 +9565,6 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
}
GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
if (!g_cublas_loaded) return false;
const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0];
const int64_t ne1 = dst->ne[1];
// TODO: find the optimal values for these
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
}
static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
@ -9891,11 +9802,6 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
}
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device =
(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
(src1->backend == GGML_BACKEND_TYPE_GPU) &&
( dst->backend == GGML_BACKEND_TYPE_GPU);
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
int64_t min_compute_capability = INT_MAX;
@ -9972,13 +9878,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
} else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch
ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) {
@ -10178,6 +10084,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
ggml_cuda_mul_mat_id_cublas(dst);
// TODO: mmq/mmv support
#endif
cudaStream_t stream = g_cudaStreams[g_main_device][0];
const size_t nb11 = src1->nb[1];
const size_t nb1 = dst->nb[1];
@ -10187,16 +10094,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
const int32_t n_as = ((int32_t *) dst->op_params)[1];
std::vector<char> ids_host(ggml_nbytes(ids));
cudaStream_t stream = g_cudaStreams[g_main_device][0];
if (ids->backend == GGML_BACKEND_TYPE_GPU) {
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
} else {
memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
}
const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
CUDA_CHECK(cudaStreamSynchronize(stream));
const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
@ -10213,20 +10113,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
src1_row.extra = &src1_row_extra;
dst_row.extra = &dst_row_extra;
char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
(char *) src1->data : (char *) src1_extra->data_device[g_main_device];
char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
(char *) dst->data : (char *) dst_extra->data_device[g_main_device];
char * src1_original = (char *) src1_extra->data_device[g_main_device];
char * dst_original = (char *) dst_extra->data_device[g_main_device];
if (src1->ne[1] == 1) {
GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);
for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
//int32_t row_id;
//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
GGML_ASSERT(row_id >= 0 && row_id < n_as);
@ -10248,11 +10139,6 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
dst_row_extra.data_device[g_main_device] = dst_contiguous.get();
const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ?
cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;
for (int32_t row_id = 0; row_id < n_as; ++row_id) {
const struct ggml_tensor * src0_row = dst->src[row_id + 2];
@ -10267,7 +10153,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
GGML_ASSERT(row_id >= 0 && row_id < n_as);
CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
nb11, src1_kind, stream));
nb11, cudaMemcpyDeviceToDevice, stream));
num_src1_rows++;
}
@ -10299,15 +10185,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
GGML_ASSERT(row_id >= 0 && row_id < n_as);
CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
nb1, dst_kind, stream));
nb1, cudaMemcpyDeviceToDevice, stream));
num_src1_rows++;
}
}
}
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
CUDA_CHECK(cudaStreamSynchronize(stream));
}
}
static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -10435,7 +10317,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
}
GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
static void ggml_cuda_set_main_device(const int main_device) {
if (main_device >= g_device_count) {
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
main_device, g_device_count, g_main_device);
@ -10450,18 +10332,9 @@ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
}
}
GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
if (!g_cublas_loaded) return false;
ggml_cuda_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
return false;
}
if (tensor->op == GGML_OP_MUL_MAT) {
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
#ifndef NDEBUG
@ -10471,6 +10344,8 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
}
}
ggml_cuda_func_t func;
switch (tensor->op) {
case GGML_OP_REPEAT:
func = ggml_cuda_repeat;
@ -10548,15 +10423,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
func = ggml_cuda_rms_norm;
break;
case GGML_OP_MUL_MAT:
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
return false;
}
func = ggml_cuda_mul_mat;
break;
case GGML_OP_MUL_MAT_ID:
if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
return false;
}
func = ggml_cuda_mul_mat_id;
break;
case GGML_OP_SCALE:
@ -10613,17 +10482,11 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
}
if (params->ith != 0) {
return true;
}
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return true;
}
func(tensor->src[0], tensor->src[1], tensor);
return true;
}
GGML_CALL int ggml_cuda_get_device_count() {
static int ggml_cuda_get_device_count() {
int device_count;
if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
return 0;
@ -10631,7 +10494,7 @@ GGML_CALL int ggml_cuda_get_device_count() {
return device_count;
}
GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
static void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
cudaDeviceProp prop;
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
snprintf(description, description_size, "%s", prop.name);
@ -10736,6 +10599,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
if (padded_size > original_size && tensor->view_src == nullptr) {
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
}
}
@ -10873,6 +10737,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
};
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
ggml_init_cublas();
// FIXME: this is not thread safe
if (device >= ggml_backend_cuda_get_device_count()) {
return nullptr;
@ -11157,6 +11023,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
};
GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
ggml_init_cublas();
// FIXME: this is not thread safe
static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
@ -11348,9 +11216,6 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
ggml_cuda_set_main_device(cuda_ctx->device);
ggml_compute_params params = {};
params.type = GGML_TASK_TYPE_COMPUTE;
params.ith = 0;
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@ -11372,7 +11237,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
}
#endif
bool ok = ggml_cuda_compute_forward(&params, node);
bool ok = ggml_cuda_compute_forward(node);
if (!ok) {
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
}
@ -11509,6 +11374,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
UNUSED(backend);
}
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
const int min_batch_size = 32;
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
UNUSED(backend);
}
static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
@ -11571,6 +11444,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ ggml_backend_cuda_supports_op,
/* .offload_op = */ ggml_backend_cuda_offload_op,
/* .event_new = */ ggml_backend_cuda_event_new,
/* .event_free = */ ggml_backend_cuda_event_free,
/* .event_record = */ ggml_backend_cuda_event_record,
@ -11584,7 +11458,7 @@ static ggml_guid_t ggml_backend_cuda_guid() {
}
GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
ggml_init_cublas(); // TODO: remove from ggml.c
ggml_init_cublas();
if (device < 0 || device >= ggml_cuda_get_device_count()) {
fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
@ -11627,6 +11501,31 @@ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, si
CUDA_CHECK(cudaMemGetInfo(free, total));
}
GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
return false;
}
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size/1024.0/1024.0, cudaGetErrorString(err));
return false;
}
return true;
}
GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
cudaError_t err = cudaHostUnregister(buffer);
if (err != cudaSuccess) {
// clear the error
cudaGetLastError();
}
}
// backend registry
GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);

View file

@ -17,29 +17,17 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API GGML_CALL void ggml_init_cublas(void);
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif

View file

@ -1951,6 +1951,7 @@ static struct ggml_backend_i kompute_backend_i = {
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
/* .supports_op = */ ggml_backend_kompute_supports_op,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,

View file

@ -2837,6 +2837,7 @@ static struct ggml_backend_i ggml_backend_metal_i = {
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute,
/* .supports_op = */ ggml_backend_metal_supports_op,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,

View file

@ -17584,6 +17584,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
/* .supports_op = */ ggml_backend_sycl_supports_op,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,

View file

@ -710,6 +710,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
}
}
// All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
// Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
if (compute_index >= 0) {
return compute_index;
}
std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;
for(auto &q_family : queue_family_props) {
@ -5693,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ ggml_backend_vk_supports_op,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_record = */ NULL,

19
ggml.c
View file

@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#else
#include <cblas.h>
#endif
#elif defined(GGML_USE_CUBLAS)
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#elif defined(GGML_USE_VULKAN)
@ -2640,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
}
#if defined(GGML_USE_CUBLAS)
ggml_init_cublas();
#elif defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_CLBLAST)
ggml_cl_init();
#elif defined(GGML_USE_VULKAN)
ggml_vk_init_cpu_assist();
@ -11105,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
// TODO: #if defined(GGML_USE_CLBLAST)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
@ -11305,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32(
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
// TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
if (params->type == GGML_TASK_TYPE_INIT) {
@ -16051,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
return;
}
#ifdef GGML_USE_CUBLAS
bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
if (skip_cpu) {
return;
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
#elif defined(GGML_USE_VULKAN)
#if defined(GGML_USE_VULKAN)
const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
if (skip_cpu) {
@ -16070,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
#endif // GGML_USE_CUBLAS
#endif // GGML_USE_VULKAN
#ifdef GGML_USE_SYCL
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);

View file

@ -2040,6 +2040,11 @@ struct llama_model {
ggml_free(ctx);
}
for (ggml_backend_buffer_t buf : bufs) {
#ifdef GGML_USE_CUBLAS
if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
}
#endif
ggml_backend_buffer_free(buf);
}
}
@ -5033,6 +5038,13 @@ static bool llm_load_tensors(
size_t first, last;
ml.get_mapping_range(&first, &last, ctx);
buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
#ifdef GGML_USE_CUBLAS
if (n_layer >= n_gpu_layers) {
ggml_backend_cuda_register_host_buffer(
ggml_backend_buffer_get_base(buf),
ggml_backend_buffer_get_size(buf));
}
#endif
}
#ifdef GGML_USE_METAL
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
@ -8231,7 +8243,6 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
cb(cur, "kqv_out", il);
}
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@ -8601,12 +8612,15 @@ static struct ggml_cgraph * llama_build_graph(
}
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
// to fix this, we assign the norm layer manually to the backend of its layer
if (il != -1 && strcmp(name, "norm") == 0) {
for (auto * backend : lctx.backends) {
if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
break;
// FIXME: fix in ggml_backend_sched
const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
if (batch.n_tokens < 32 || full_offload) {
if (il != -1 && strcmp(name, "norm") == 0) {
for (auto * backend : lctx.backends) {
if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend);
break;
}
}
}
}
@ -13107,27 +13121,25 @@ struct llama_context * llama_new_context_with_model(
ctx->backends.push_back(ctx->backend_metal);
}
#elif defined(GGML_USE_CUBLAS)
if (model->n_gpu_layers > 0) {
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
ggml_backend_t backend = ggml_backend_cuda_init(device);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
} else {
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
ggml_backend_t backend = ggml_backend_cuda_init(device);
if (backend == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
llama_free(ctx);
return nullptr;
}
ctx->backends.push_back(backend);
}
}
}
#elif defined(GGML_USE_VULKAN)
@ -13285,14 +13297,17 @@ struct llama_context * llama_new_context_with_model(
ggml_backend_t backend = ctx->backends[i];
ggml_backend_buffer_type_t buft = backend_buft[i];
size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_buft_name(buft),
size / 1024.0 / 1024.0);
if (size > 1) {
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
ggml_backend_buft_name(buft),
size / 1024.0 / 1024.0);
}
}
// note: the number of splits during measure is higher than during inference due to the kv shift
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
}
}