Merge branch 'master' into mpi-heterogenous
commit 1d744d8226
49 changed files with 3052 additions and 1453 deletions
.github/workflows/build.yml (vendored, 90 lines changed)

@@ -48,6 +48,28 @@ jobs:
           CC=gcc-8 make tests -j $(nproc)
           make test -j $(nproc)
 
+  ubuntu-focal-make-curl:
+    runs-on: ubuntu-20.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev
+
+      - name: Build
+        id: make_build
+        env:
+          LLAMA_FATAL_WARNINGS: 1
+          LLAMA_CURL: 1
+        run: |
+          CC=gcc-8 make -j $(nproc)
+
   ubuntu-latest-cmake:
     runs-on: ubuntu-latest
 
@@ -76,40 +98,40 @@ jobs:
           cd build
           ctest -L main --verbose --timeout 900
 
-  ubuntu-latest-cmake-sanitizer:
-    runs-on: ubuntu-latest
-
-    continue-on-error: true
-
-    strategy:
-      matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        build_type: [Debug, Release]
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v3
-
-      - name: Dependencies
-        id: depends
-        run: |
-          sudo apt-get update
-          sudo apt-get install build-essential
-
-      - name: Build
-        id: cmake_build
-        run: |
-          mkdir build
-          cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
-          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
+#  ubuntu-latest-cmake-sanitizer:
+#    runs-on: ubuntu-latest
+#
+#    continue-on-error: true
+#
+#    strategy:
+#      matrix:
+#        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+#        build_type: [Debug, Release]
+#
+#    steps:
+#      - name: Clone
+#        id: checkout
+#        uses: actions/checkout@v3
+#
+#      - name: Dependencies
+#        id: depends
+#        run: |
+#          sudo apt-get update
+#          sudo apt-get install build-essential
+#
+#      - name: Build
+#        id: cmake_build
+#        run: |
+#          mkdir build
+#          cd build
+#          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+#          cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+#
+#      - name: Test
+#        id: cmake_test
+#        run: |
+#          cd build
+#          ctest -L main --verbose --timeout 900
 
   ubuntu-latest-cmake-mpi:
     runs-on: ubuntu-latest
.github/workflows/close-issue.yml (vendored, new file, 22 lines)

@@ -0,0 +1,22 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "42 0 * * *"
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v5
+        with:
+          days-before-issue-stale: 30
+          days-before-issue-close: 14
+          stale-issue-label: "stale"
+          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
+          operations-per-run: 1000
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/server.yml (vendored, 26 lines changed)

@@ -24,13 +24,13 @@ jobs:
 
     strategy:
       matrix:
-        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        # TODO: temporary disabled due to linux kernel issues
+        #sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        sanitizer: [UNDEFINED]
        build_type: [Debug]
        include:
          - build_type: Release
            sanitizer: ""
-          - build_type: Debug
-            sanitizer: THREAD
            disabled_on_pr: true
      fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
 
@@ -57,7 +57,8 @@ jobs:
            cmake \
            python3-pip \
            wget \
-            language-pack-en
+            language-pack-en \
+            libcurl4-openssl-dev
 
      - name: Build
        id: cmake_build
@@ -67,6 +68,7 @@ jobs:
          cmake .. \
            -DLLAMA_NATIVE=OFF \
            -DLLAMA_BUILD_SERVER=ON \
+            -DLLAMA_CURL=ON \
            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
@@ -101,12 +103,21 @@ jobs:
        with:
          fetch-depth: 0
 
+      - name: libCURL
+        id: get_libcurl
+        env:
+          CURL_VERSION: 8.6.0_6
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
+          mkdir $env:RUNNER_TEMP/libcurl
+          tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
+
      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-          cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
+          cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
 
      - name: Python setup
@@ -120,6 +131,11 @@ jobs:
        run: |
          pip install -r examples/server/tests/requirements.txt
 
+      - name: Copy Libcurl
+        id: prepare_libcurl
+        run: |
+          cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
+
      - name: Tests
        id: server_integration_tests
        if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
@@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some
 set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")
+option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
 option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
Makefile (7 lines changed)

@@ -553,7 +553,7 @@ endif
 endif # LLAMA_METAL
 
 ifdef LLAMA_METAL
-ggml-metal.o: ggml-metal.m ggml-metal.h
+ggml-metal.o: ggml-metal.m ggml-metal.h ggml.h
	$(CC) $(CFLAGS) -c $< -o $@
 
 ifdef LLAMA_METAL_EMBED_LIBRARY
@@ -595,6 +595,11 @@ include scripts/get-flags.mk
 CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
 endif
 
+ifdef LLAMA_CURL
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
+override LDFLAGS := $(LDFLAGS) -lcurl
+endif
+
 #
 # Print build information
 #
@@ -112,6 +112,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
 - [x] [Mamba](https://github.com/state-spaces/mamba)
+- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
 
 **Multimodal models:**
 
@@ -133,6 +134,7 @@ Typically finetunes of the base models below are supported as well.
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
 - JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
 - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm)
+- Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama)
 - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
 - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
 - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
@@ -68,6 +68,17 @@ if (BUILD_SHARED_LIBS)
     set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
 endif()
 
+set(LLAMA_COMMON_EXTRA_LIBS build_info)
+
+# Use curl to download model url
+if (LLAMA_CURL)
+    find_package(CURL REQUIRED)
+    add_definitions(-DLLAMA_USE_CURL)
+    include_directories(${CURL_INCLUDE_DIRS})
+    find_library(CURL_LIBRARY curl REQUIRED)
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
+endif ()
+
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features(${TARGET} PUBLIC cxx_std_11)
-target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama)
+target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
common/common.cpp (1201 lines changed)
File diff suppressed because it is too large.
@@ -37,10 +37,13 @@ extern char const *LLAMA_COMMIT;
 extern char const *LLAMA_COMPILER;
 extern char const *LLAMA_BUILD_TARGET;
 
+struct llama_control_vector_load_info;
+
+int32_t get_num_physical_cores();
+
 //
 // CLI argument parsing
 //
-int32_t get_num_physical_cores();
 
 struct gpt_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
@@ -86,6 +89,7 @@ struct gpt_params {
     struct llama_sampling_params sparams;
 
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model_url = ""; // model url to download
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
@@ -103,6 +107,11 @@ struct gpt_params {
     std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
     std::string lora_base = ""; // base model path for the lora adapter
 
+    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+
+    int32_t control_vector_layer_start = -1; // layer range for control vector
+    int32_t control_vector_layer_end = -1;   // layer range for control vector
+
     int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
                              // (which is more convenient to use for plotting)
@@ -183,6 +192,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
 
+struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
+                                               struct llama_model_params params);
+
 // Batch utils
 
 void llama_batch_clear(struct llama_batch & batch);
|
||||||
void llama_embd_normalize(const float * inp, float * out, int n);
|
void llama_embd_normalize(const float * inp, float * out, int n);
|
||||||
|
|
||||||
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Control vector utils
|
||||||
|
//
|
||||||
|
|
||||||
|
struct llama_control_vector_data {
|
||||||
|
int n_embd;
|
||||||
|
|
||||||
|
// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
|
||||||
|
std::vector<float> data;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct llama_control_vector_load_info {
|
||||||
|
float strength;
|
||||||
|
|
||||||
|
std::string fname;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Load control vectors, scale each by strength, and add them together.
|
||||||
|
// On error, returns {-1, empty}
|
||||||
|
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
|
||||||
|
|
|
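A small sketch of how the control-vector loader declared above might be driven; the file names, strengths, and the function name below are placeholders, and only the declarations from this hunk are used:

```cpp
#include "common.h"
#include <cstdio>

// Load two control vectors, scale each by its strength, and inspect the result.
int load_control_vectors_example() {
    std::vector<llama_control_vector_load_info> infos = {
        { 0.8f, "happy.gguf" },  // strength, fname (placeholders)
        { -0.4f, "sad.gguf" },
    };

    llama_control_vector_data cvec = llama_control_vector_load(infos);
    if (cvec.n_embd == -1) {
        fprintf(stderr, "failed to load control vectors\n"); // per the declared error convention
        return 1;
    }

    // data covers layers [1, n_layer] where n_layer = data.size() / n_embd
    const int n_layer = (int) (cvec.data.size() / cvec.n_embd);
    printf("loaded control vector: n_embd = %d, layers = %d\n", cvec.n_embd, n_layer);
    return 0;
}
```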
@@ -1634,7 +1634,7 @@ in chat mode so that the conversation can end normally.")
         self.post_write_tensors(tensor_map, name, data_torch)
 
 
-@Model.register("BertModel")
+@Model.register("BertModel", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -1965,6 +1965,23 @@ class MambaModel(Model):
         self.gguf_writer.add_tensor(new_name, data)
 
 
+@Model.register("CohereForCausalLM")
+class CommandR2Model(Model):
+    model_arch = gguf.MODEL_ARCH.COMMAND_R
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # max_position_embeddings = 8192 in config.json but model was actually
+        # trained on 128k context length
+        self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_logit_scale(self.hparams["logit_scale"])
+        self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
+
+
 ###### CONVERSION LOGIC ######
 
 
@@ -1167,9 +1167,9 @@ class OutputFile:
 def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType:
     wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type
 
-    if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32):
+    if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)):
         return GGMLFileType.AllF32
-    if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)):
+    if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16):
         return GGMLFileType.MostlyF16
     if output_type_str == "q8_0":
         return GGMLFileType.MostlyQ8_0
examples/gritlm/README.md (new file, 62 lines)

@@ -0,0 +1,62 @@
+## Generative Representational Instruction Tuning (GRIT) Example
+[gritlm] a model which can generate embeddings as well as "normal" text
+generation depending on the instructions in the prompt.
+
+* Paper: https://arxiv.org/pdf/2402.09906.pdf
+
+### Retrieval-Augmented Generation (RAG) use case
+One use case for `gritlm` is to use it with RAG. If we recall how RAG works is
+that we take documents that we want to use as context, to ground the large
+language model (LLM), and we create token embeddings for them. We then store
+these token embeddings in a vector database.
+
+When we perform a query, prompt the LLM, we will first create token embeddings
+for the query and then search the vector database to retrieve the most
+similar vectors, and return those documents so they can be passed to the LLM as
+context. Then the query and the context will be passed to the LLM which will
+have to _again_ create token embeddings for the query. But because gritlm is used
+the first query can be cached and the second query tokenization generation does
+not have to be performed at all.
+
+### Running the example
+Download a Grit model:
+```console
+$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf
+```
+
+Run the example using the downloaded model:
+```console
+$ ./gritlm -m gritlm-7b_q4_1.gguf
+
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605
+Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103
+Cosine similarity between "Generative Representational Instruction Tuning" and "A purely peer-to-peer version of electronic cash w" is: 0.112
+Cosine similarity between "Generative Representational Instruction Tuning" and "All text-based language problems can be reduced to" is: 0.547
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+When shadows lurk and ghosts do roam,
+And darkness reigns, a fearsome sight.
+
+Thou didst set out, with heart aglow,
+To conquer this mountain, so high,
+And reach the summit, where the stars do glow,
+And the moon shines bright, up in the sky.
+
+Through the mist and fog, thou didst press on,
+With steadfast courage, and a steadfast will,
+Through the darkness, thou didst not be gone,
+But didst climb on, with a steadfast skill.
+
+At last, thou didst reach the summit's crest,
+And gazed upon the world below,
+And saw the beauty of the night's best,
+And felt the peace, that only nature knows.
+
+Oh, brave adventurer, who dared to climb
+The lofty peak of Mt. Fuji in the night,
+Thou art a hero, in the eyes of all,
+For thou didst conquer this mountain, so bright.
+```
+
+[gritlm]: https://github.com/ContextualAI/gritlm
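The cosine-similarity figures printed above can be reproduced with the embedding helpers already declared in `common.h` (shown earlier in this diff); a hedged sketch, where the two raw embedding vectors and the function name are assumptions:

```cpp
#include "common.h"
#include <vector>

// Normalize two raw embeddings (n_embd floats each, e.g. produced by the gritlm
// example) and return their cosine similarity.
float embedding_similarity(const std::vector<float> & query_embd,
                           const std::vector<float> & doc_embd,
                           int n_embd) {
    std::vector<float> q_norm(n_embd), d_norm(n_embd);
    llama_embd_normalize(query_embd.data(), q_norm.data(), n_embd);
    llama_embd_normalize(doc_embd.data(),   d_norm.data(), n_embd);
    return llama_embd_similarity_cos(q_norm.data(), d_norm.data(), n_embd);
}
```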
@@ -56,13 +56,31 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const struct ggml_tensor * src0 = t->src[0];
     const struct ggml_tensor * src1 = t->src[1];
 
+    std::string wname;
+    {
+        // remove any prefix and suffixes from the name
+        // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
+        const char * p = strchr(src0->name, '#');
+        if (p != NULL) {
+            p = p + 1;
+            const char * q = strchr(p, '#');
+            if (q != NULL) {
+                wname = std::string(p, q - p);
+            } else {
+                wname = p;
+            }
+        } else {
+            wname = src0->name;
+        }
+    }
+
     // when ask is true, the scheduler wants to know if we are interested in data from this tensor
     // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
     if (ask) {
         if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
         if (t->op != GGML_OP_MUL_MAT) return false;
         if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
+        if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
         return true;
     }
 
@@ -94,12 +112,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
         // this is necessary to guarantee equal number of "ncall" for each tensor
         for (int ex = 0; ex < n_as; ++ex) {
             src0 = t->src[2 + ex];
-            auto& e = m_stats[src0->name];
+            auto& e = m_stats[wname];
             if (e.values.empty()) {
                 e.values.resize(src1->ne[0], 0);
             }
             else if (e.values.size() != (size_t)src1->ne[0]) {
-                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
                 exit(1); //GGML_ASSERT(false);
             }
             // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
@@ -107,7 +125,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
             ++e.ncall;
             if (m_params.verbosity > 1) {
-                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
             }
             for (int row = 0; row < (int)src1->ne[1]; ++row) {
                 const int excur = m_ids[row*n_as + idx];
@@ -129,17 +147,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
             }
         }
     } else {
-        auto& e = m_stats[src0->name];
+        auto& e = m_stats[wname];
         if (e.values.empty()) {
             e.values.resize(src1->ne[0], 0);
         }
         else if (e.values.size() != (size_t)src1->ne[0]) {
-            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
             exit(1); //GGML_ASSERT(false);
         }
         ++e.ncall;
         if (m_params.verbosity > 1) {
-            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
         }
         for (int row = 0; row < (int)src1->ne[1]; ++row) {
             const float * x = data + row * src1->ne[0];
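The name-normalization block added above can also be read as a standalone helper; the sketch below simply lifts that logic out for illustration (the function name is hypothetical):

```cpp
#include <cstring>
#include <string>

// Strip the backend prefix and split suffix the scheduler may attach to a tensor
// name, e.g. "CUDA0#blk.0.attn_k.weight#0" -> "blk.0.attn_k.weight".
static std::string filter_tensor_name(const char * name) {
    std::string wname;
    const char * p = strchr(name, '#');
    if (p != NULL) {
        p = p + 1;
        const char * q = strchr(p, '#');
        if (q != NULL) {
            wname = std::string(p, q - p);
        } else {
            wname = p;
        }
    } else {
        wname = name;
    }
    return wname;
}
```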
@@ -8,6 +8,7 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <cstdlib>
 #include <iterator>
 #include <map>
 #include <numeric>
@@ -113,10 +114,10 @@ static std::string get_cpu_info() {
 static std::string get_gpu_info() {
     std::string id;
 #ifdef GGML_USE_CUBLAS
-    int count = ggml_cuda_get_device_count();
+    int count = ggml_backend_cuda_get_device_count();
     for (int i = 0; i < count; i++) {
         char buf[128];
-        ggml_cuda_get_device_description(i, buf, sizeof(buf));
+        ggml_backend_cuda_get_device_description(i, buf, sizeof(buf));
         id += buf;
         if (i < count - 1) {
             id += "/";
@@ -1123,15 +1124,19 @@ struct sql_printer : public printer {
 static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
-    //std::vector<llama_token> tokens(n_prompt, llama_token_bos(llama_get_model(ctx)));
-    //llama_decode(ctx, llama_batch_get_one(tokens.data(), n_prompt, n_past, 0));
-    //GGML_UNUSED(n_batch);
+    const llama_model * model = llama_get_model(ctx);
+    const int32_t n_vocab = llama_n_vocab(model);
+
+    std::vector<llama_token> tokens(n_batch);
 
-    std::vector<llama_token> tokens(n_batch, llama_token_bos(llama_get_model(ctx)));
     int n_processed = 0;
 
     while (n_processed < n_prompt) {
         int n_tokens = std::min(n_prompt - n_processed, n_batch);
+        tokens[0] = n_processed == 0 && llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+        for (int i = 1; i < n_tokens; i++) {
+            tokens[i] = std::rand() % n_vocab;
+        }
         llama_decode(ctx, llama_batch_get_one(tokens.data(), n_tokens, n_past + n_processed, 0));
         n_processed += n_tokens;
     }
@@ -1142,11 +1147,15 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat
 static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) {
     llama_set_n_threads(ctx, n_threads, n_threads);
 
-    llama_token token = llama_token_bos(llama_get_model(ctx));
+    const llama_model * model = llama_get_model(ctx);
+    const int32_t n_vocab = llama_n_vocab(model);
+
+    llama_token token = llama_add_bos_token(model) ? llama_token_bos(model) : std::rand() % n_vocab;
+
     for (int i = 0; i < n_gen; i++) {
         llama_decode(ctx, llama_batch_get_one(&token, 1, n_past + i, 0));
         llama_synchronize(ctx);
+        token = std::rand() % n_vocab;
     }
 }
 
@@ -497,7 +497,6 @@ struct clip_ctx {
 
     // memory buffers to evaluate the model
     ggml_backend_buffer_t params_buffer = NULL;
-    ggml_backend_buffer_t compute_buffer = NULL;
 
     ggml_backend_t backend = NULL;
     ggml_gallocr_t compute_alloc = NULL;
@@ -1235,16 +1234,16 @@ struct clip_image_f32 * clip_image_f32_init() {
 
 void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
 void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
-void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
     }
 }
-void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
-    if (batch.size > 0) {
-        delete[] batch.data;
-        batch.size = 0;
+void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
+    if (batch->size > 0) {
+        delete[] batch->data;
+        batch->size = 0;
     }
 }
 
@@ -1497,7 +1496,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im
 
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
+bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         printf("This gguf file seems to have no vision encoder\n");
@@ -1509,11 +1508,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         pad_to_square = false;
     }
     // free the previous res_imgs if any set
-    if (res_imgs.size > 0) {
+    if (res_imgs->size > 0) {
         clip_image_f32_batch_free(res_imgs);
     }
-    res_imgs.data = nullptr;
-    res_imgs.size = 0;
+    res_imgs->data = nullptr;
+    res_imgs->size = 0;
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
@@ -1568,11 +1567,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
         bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
         patches.insert(patches.begin(), image_original_resize);
         // clip_image_f32_batch_init(patches.size());
-        res_imgs.size = patches.size();
-        res_imgs.data = new clip_image_f32[res_imgs.size];
+        res_imgs->size = patches.size();
+        res_imgs->data = new clip_image_f32[res_imgs->size];
         int num=0;
         for (auto& patch : patches) {
-            normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
+            normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
             num++;
         }
 
@@ -1660,9 +1659,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
     // }
     // res_imgs.push_back(res);
 
-    res_imgs.size = 1;
-    res_imgs.data = new clip_image_f32[res_imgs.size];
-    res_imgs.data[0] = *res;
+    res_imgs->size = 1;
+    res_imgs->data = new clip_image_f32[res_imgs->size];
+    res_imgs->data[0] = *res;
     clip_image_f32_free(res);
 
     return true;
@@ -1676,6 +1675,9 @@ void clip_free(clip_ctx * ctx) {
     ggml_free(ctx->ctx_data);
     gguf_free(ctx->ctx_gguf);
 
+    ggml_backend_buffer_free(ctx->params_buffer);
+    ggml_backend_free(ctx->backend);
+    ggml_gallocr_free(ctx->compute_alloc);
     delete ctx;
 }
 
@@ -60,8 +60,8 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();
 
 CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
 CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
-CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
-CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
+CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
+CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
 
 CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
 
@@ -69,7 +69,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
 CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
 
 /** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
-CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
+CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
 
 CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
 
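A sketch of the updated calling convention for users of these declarations: the batch is now passed and freed by pointer, mirroring the llava.cpp change in the next hunk (the wrapper function name is hypothetical):

```cpp
#include "clip.h"

// Preprocess one image into a batch of float patches, then release the batch.
bool preprocess_one(clip_ctx * ctx_clip, const clip_image_u8 * img) {
    clip_image_f32_batch img_res_v;
    img_res_v.size = 0;
    img_res_v.data = nullptr;

    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
        return false;
    }

    // ... encode img_res_v.data[0 .. img_res_v.size - 1] here ...

    clip_image_f32_batch_free(&img_res_v);
    return true;
}
```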
@@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     clip_image_f32_batch img_res_v;
     img_res_v.size = 0;
     img_res_v.data = nullptr;
-    if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
+    if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
         fprintf(stderr, "%s: unable to preprocess image\n", __func__);
         delete[] img_res_v.data;
         return false;
@@ -29,9 +29,9 @@ struct llava_image_embed {
 };
 
 /** sanity check for clip <-> llava embed size match */
-LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
+LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);
 
-LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
+LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
 
 /** build an image embed from image file bytes */
 LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
@@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
 
 - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
 - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
@@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co
 - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
 - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
 - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
+- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
 - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
 - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
@@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
     }
     printf("  -m FNAME, --model FNAME\n");
     printf("                            model path (default: %s)\n", params.model.c_str());
+    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
+    printf("                            model download url (default: %s)\n", params.model_url.c_str());
     printf("  -a ALIAS, --alias ALIAS\n");
     printf("                            set an alias for the model, will be added as `model` field in completion response\n");
     printf("  --lora FNAME              apply LoRA adapter (implies --no-mmap)\n");
@@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
             break;
         }
         params.model = argv[i];
+    } else if (arg == "-mu" || arg == "--model-url") {
+        if (++i >= argc) {
+            invalid_param = true;
+            break;
+        }
+        params.model_url = argv[i];
     } else if (arg == "-a" || arg == "--alias") {
         if (++i >= argc) {
             invalid_param = true;
@@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de
 To run a scenario annotated with `@bug`, start:
 
 ```shell
-DEBUG=ON ./tests.sh --no-skipped --tags bug
+DEBUG=ON ./tests.sh --no-skipped --tags bug --stop
 ```
 
 After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated.
@@ -4,7 +4,8 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
+    And   a model file ggml-model-f16.gguf
    And   a model alias bert-bge-small
    And   42 as server seed
    And   2 slots
@@ -1,10 +1,12 @@
-import errno
 import os
-import socket
-import subprocess
-import time
-from contextlib import closing
 import signal
+import socket
+import sys
+import time
+import traceback
+from contextlib import closing
+
+import psutil
 
 
 def before_scenario(context, scenario):
@@ -20,7 +22,8 @@ def before_scenario(context, scenario):
 
 
 def after_scenario(context, scenario):
-    if context.server_process is None:
+    try:
+        if 'server_process' not in context or context.server_process is None:
         return
     if scenario.status == "failed":
         if 'GITHUB_ACTIONS' in os.environ:
@@ -47,6 +50,12 @@ def after_scenario(context, scenario):
                 attempts += 1
                 if attempts > 5:
                     server_kill_hard(context)
+    except:
+        exc = sys.exception()
+        print("error in after scenario: \n")
+        print(exc)
+        print("*** print_tb: \n")
+        traceback.print_tb(exc.__traceback__, file=sys.stdout)
 
 
 def server_graceful_shutdown(context):
@@ -67,11 +76,11 @@ def server_kill_hard(context):
     path = context.server_path
 
     print(f"Server dangling exits, hard killing force {pid}={path}...\n")
-    if os.name == 'nt':
-        process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode()
-        print(process)
-    else:
-        os.kill(-pid, signal.SIGKILL)
+    try:
+        psutil.Process(pid).kill()
+    except psutil.NoSuchProcess:
+        return False
+    return True
 
 
 def is_server_listening(server_fqdn, server_port):
@@ -84,17 +93,9 @@ def is_server_listening(server_fqdn, server_port):
 
 
 def pid_exists(pid):
-    """Check whether pid exists in the current process table."""
-    if pid < 0:
-        return False
-    if os.name == 'nt':
-        output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode()
-        print(output)
-        return "No tasks are running" not in output
-    else:
-        try:
-            os.kill(pid, 0)
-        except OSError as e:
-            return e.errno == errno.EPERM
-        else:
-            return True
+    try:
+        psutil.Process(pid)
+    except psutil.NoSuchProcess:
+        return False
+    return True
 
@@ -4,7 +4,8 @@ Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And   a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
+    And   a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf
+    And   a model file stories260K.gguf
    And   a model alias tinyllama-2
    And   42 as server seed
    # KV Cache corresponds to the total amount of tokens
@ -5,6 +5,8 @@ import os
|
||||||
 import re
 import socket
 import subprocess
+import sys
+import threading
 import time
 from contextlib import closing
 from re import RegexFlag

@@ -32,6 +34,8 @@ def step_server_config(context, server_fqdn, server_port):
     context.base_url = f'http://{context.server_fqdn}:{context.server_port}'

     context.model_alias = None
+    context.model_file = None
+    context.model_url = None
     context.n_batch = None
     context.n_ubatch = None
     context.n_ctx = None

@@ -65,6 +69,16 @@ def step_download_hf_model(context, hf_file, hf_repo):
     print(f"model file: {context.model_file}\n")


+@step('a model file {model_file}')
+def step_model_file(context, model_file):
+    context.model_file = model_file
+
+
+@step('a model url {model_url}')
+def step_model_url(context, model_url):
+    context.model_url = model_url
+
+
 @step('a model alias {model_alias}')
 def step_model_alias(context, model_alias):
     context.model_alias = model_alias

@@ -141,7 +155,8 @@ def step_start_server(context):
 async def step_wait_for_the_server_to_be_started(context, expecting_status):
     match expecting_status:
         case 'healthy':
-            await wait_for_health_status(context, context.base_url, 200, 'ok')
+            await wait_for_health_status(context, context.base_url, 200, 'ok',
+                                         timeout=30)

         case 'ready' | 'idle':
             await wait_for_health_status(context, context.base_url, 200, 'ok',

@@ -1038,8 +1053,11 @@ def start_server_background(context):
     server_args = [
         '--host', server_listen_addr,
         '--port', context.server_port,
-        '--model', context.model_file
     ]
+    if context.model_file:
+        server_args.extend(['--model', context.model_file])
+    if context.model_url:
+        server_args.extend(['--model-url', context.model_url])
     if context.n_batch:
         server_args.extend(['--batch-size', context.n_batch])
     if context.n_ubatch:

@@ -1079,8 +1097,23 @@ def start_server_background(context):

     pkwargs = {
         'creationflags': flags,
+        'stdout': subprocess.PIPE,
+        'stderr': subprocess.PIPE
     }
     context.server_process = subprocess.Popen(
         [str(arg) for arg in [context.server_path, *server_args]],
         **pkwargs)
+
+    def log_stdout(process):
+        for line in iter(process.stdout.readline, b''):
+            print(line.decode('utf-8'), end='')
+    thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,))
+    thread_stdout.start()
+
+    def log_stderr(process):
+        for line in iter(process.stderr.readline, b''):
+            print(line.decode('utf-8'), end='', file=sys.stderr)
+    thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,))
+    thread_stderr.start()
+
     print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}")
@@ -3,4 +3,5 @@ behave~=1.2.6
 huggingface_hub~=0.20.3
 numpy~=1.24.4
 openai~=0.25.0
+psutil~=5.9.8
 prometheus-client~=0.20.0
@@ -13,8 +13,11 @@ source /opt/intel/oneapi/setvars.sh
 #for FP32
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

-#build example/main only
+#build example/main
 #cmake --build . --config Release --target main

+#build example/llama-bench
+#cmake --build . --config Release --target llama-bench
+
 #build all binary
 cmake --build . --config Release -v
@@ -9,18 +9,28 @@ source /opt/intel/oneapi/setvars.sh

 if [ $# -gt 0 ]; then
     GGML_SYCL_DEVICE=$1
+    GGML_SYCL_SINGLE_GPU=1
 else
     GGML_SYCL_DEVICE=0
 fi
-echo "use $GGML_SYCL_DEVICE as main GPU"
+
 #export GGML_SYCL_DEBUG=1


 #ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer.

-#use all GPUs with same max compute units
-ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+if [ $GGML_SYCL_SINGLE_GPU -eq 1 ]; then
+    echo "use $GGML_SYCL_DEVICE as main GPU"
+    #use signle GPU only
+    ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+else
+    #use multiple GPUs with same max compute units
+    ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+fi

 #use main GPU only
 #ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none
+
+#use multiple GPUs with same max compute units
+#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
6 flake.lock generated
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1709703039,
-        "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=",
+        "lastModified": 1710451336,
+        "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d",
+        "rev": "d691274a972b3165335d261cc4671335f5c67de9",
         "type": "github"
       },
       "original": {
10 ggml-alloc.c
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];

-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr

             ggml_gallocr_hash_get(galloc, src)->n_children += 1;

-            // allocate explicit inputs and leafs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
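For context, a minimal sketch (not taken from this hunk) of the kind of node the new check skips: ggml-backend builds a dependency-only tensor with ggml_view_tensor(), which leaves op set to GGML_OP_NONE and view_src pointing at the source, and wires the real dependency through src[0]. The allocator must not count it as a view of its source; `ctx`, `input` and `graph_copy` are assumed to exist in the caller.

    // sketch: a dependency-only node as created by ggml-backend (see the ggml-backend.c hunks below)
    struct ggml_tensor * input_dep = ggml_view_tensor(ctx, input); // op == GGML_OP_NONE, view_src == input
    input_dep->src[0] = input;                                     // the real dependency lives in src[0]
    graph_copy->nodes[graph_copy->n_nodes++] = input_dep;          // the node itself is never used as an operand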
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);
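As a rough illustration of how a backend could fill in this new hook (hypothetical backend name and threshold, not part of this diff): report only large-batch, expensive operations as worth offloading, since the weight has to be copied from the CPU buffer first.

    // sketch for a hypothetical backend; the 32-row threshold is illustrative, not from this commit
    static bool ggml_backend_mybackend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
        (void) backend;
        const int min_batch_size = 32;
        // only ops with a large enough batch amortize the temporary weight copy;
        // cheap lookups such as GGML_OP_GET_ROWS stay where their data lives
        return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
    }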
276 ggml-backend.c
@@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
     return err;
 }

-bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }

@@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor *
     return backend->iface.supports_op(backend, op);
 }

+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    if (backend->iface.offload_op != NULL) {
+        return backend->iface.offload_op(backend, op);
+    }
+    return false;
+}
+
 // backend copy

 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
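A small usage sketch for the changed async signature (not code from this commit; `backend` and `cgraph` are assumed to exist): the async variant now reports the same ggml_status as the synchronous path, so callers can check it before synchronizing.

    enum ggml_status status = ggml_backend_graph_compute_async(backend, cgraph);
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "async graph compute failed with status %d\n", (int) status);
    }
    ggml_backend_synchronize(backend); // wait for completion before reading results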
@@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg

     if (cpu_plan->cplan.work_size > 0) {
         cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+        if (cpu_plan->cplan.work_data == NULL) {
+            free(cpu_plan);
+            return NULL;
+        }
     }

     cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
@@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute      = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute           = */ ggml_backend_cpu_graph_compute,
     /* .supports_op             = */ ggml_backend_cpu_supports_op,
+    /* .offload_op              = */ NULL,
     /* .event_new               = */ NULL,
     /* .event_free              = */ NULL,
     /* .event_record            = */ NULL,
@@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif

 #ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 256
+#define GGML_SCHED_MAX_SPLITS 2048
 #endif

 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
-#define GGML_SCHED_MAX_SPLIT_INPUTS 16
+#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC
 #endif

 #ifndef GGML_SCHED_MAX_COPIES
@@ -1043,8 +1055,9 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;

     // graph splits
-    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
+    struct ggml_backend_sched_split * splits;
     int n_splits;
+    int splits_capacity;

     // pipeline parallelism support
     int n_copies;
@@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     // TODO: use supports_op to check if the backend supports the op

     // assign pre-allocated nodes to their backend
-    // dst
-    int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor);
-    if (cur_backend != -1) {
+    int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+    if (cur_backend_id != -1) {
         SET_CAUSE(tensor, "1.dst");
-        return cur_backend;
+        return cur_backend_id;
     }

     // view_src
     if (tensor->view_src != NULL) {
-        cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
-        if (cur_backend != -1) {
+        cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+        if (cur_backend_id != -1) {
             SET_CAUSE(tensor, "1.vsrc");
-            return cur_backend;
+            return cur_backend_id;
         }
     }

-    // input
+    // graph input
     if (tensor->flags & GGML_TENSOR_FLAG_INPUT) {
-        cur_backend = sched->n_backends - 1; // last backend (assumed CPU)
+        cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
         SET_CAUSE(tensor, "1.inp");
-        return cur_backend;
+        return cur_backend_id;
     }

     // assign nodes that use weights to the backend of the weights
+    // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
-            int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
-            // operations with weights are always run on the same backend as the weights
+            int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+            // check if a backend with higher prio wants to offload the op
+            if (src_backend_id == sched->n_backends - 1) {
+                for (int b = 0; b < src_backend_id; b++) {
+                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                        SET_CAUSE(tensor, "1.off");
+                        return b;
+                    }
+                }
+            }
             SET_CAUSE(tensor, "1.wgt%d", i);
-            return src_backend;
+            return src_backend_id;
         }
     }

@@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 1: assign backends to ops with pre-allocated inputs
     for (int i = 0; i < graph->n_leafs; i++) {
         struct ggml_tensor * leaf = graph->leafs[i];
-        if (tensor_backend_id(leaf) != -1) {
+        int * leaf_backend_id = &tensor_backend_id(leaf);
+        if (*leaf_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
+        *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf);
     }

     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        if (tensor_backend_id(node) != -1) {
+        int * node_backend_id = &tensor_backend_id(node);
+        if (*node_backend_id != -1) {
             // do not overwrite user assignments
             continue;
         }
-        tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
+        *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node);
         // src
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
                 continue;
             }
-            if (tensor_backend_id(src) == -1) {
-                tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
+                *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src);
             }
         }
     }
@@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.2");
             }
         }
     }

     // pass 2.1 expand gpu up
     {
         int cur_backend_id = -1;
@@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                if (tensor_backend_id == sched->n_backends - 1) {
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                if (*node_backend_id == sched->n_backends - 1) {
                     // skip cpu (lowest prio backend)
                     cur_backend_id = -1;
                 } else {
-                    cur_backend_id = tensor_backend_id;
+                    cur_backend_id = *node_backend_id;
                 }
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.1");
             }
         }
     }


     // pass 2.4 expand rest down
     {
         int cur_backend_id = -1;
@@ -1317,11 +1338,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.4");
             }
         }
@@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (ggml_is_view_op(node->op)) {
                 continue;
             }
-            int tensor_backend_id = tensor_backend_id(node);
-            if (tensor_backend_id != -1) {
-                cur_backend_id = tensor_backend_id;
+            int * node_backend_id = &tensor_backend_id(node);
+            if (*node_backend_id != -1) {
+                cur_backend_id = *node_backend_id;
             } else {
-                tensor_backend_id(node) = cur_backend_id;
+                *node_backend_id = cur_backend_id;
                 SET_CAUSE(node, "2.3");
             }
         }
@@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
     // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        int cur_backend_id = tensor_backend_id(node);
-        if (node->view_src != NULL && cur_backend_id == -1) {
-            cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
+        int * cur_backend_id = &tensor_backend_id(node);
+        if (node->view_src != NULL && *cur_backend_id == -1) {
+            *cur_backend_id = tensor_backend_id(node->view_src);
             SET_CAUSE(node, "3.vsrc");
         }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
@@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
             if (src == NULL) {
                 continue;
             }
-            int src_backend_id = tensor_backend_id(src);
-            if (src_backend_id == -1) {
+            int * src_backend_id = &tensor_backend_id(src);
+            if (*src_backend_id == -1) {
                 if (src->view_src != NULL) {
                     // views are always on the same backend as the source
-                    tensor_backend_id(src) = tensor_backend_id(src->view_src);
+                    *src_backend_id = tensor_backend_id(src->view_src);
                     SET_CAUSE(src, "3.vsrc");
                 } else {
-                    tensor_backend_id(src) = cur_backend_id;
+                    *src_backend_id = *cur_backend_id;
                     SET_CAUSE(src, "3.cur");
                 }
             }
@@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

     // pass 4: split graph, find tensors that need to be copied
     {
-        int cur_split = 0;
+        int i_split = 0;
+        struct ggml_backend_sched_split * split = &sched->splits[0];
         // find the backend of the first split, skipping view ops
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];
             if (!ggml_is_view_op(node->op)) {
-                sched->splits[0].backend_id = tensor_backend_id(node);
+                split->backend_id = tensor_backend_id(node);
                 break;
             }
         }
-        sched->splits[0].i_start = 0;
-        sched->splits[0].n_inputs = 0;
-        memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
-        int cur_backend_id = sched->splits[0].backend_id;
+        split->i_start = 0;
+        split->n_inputs = 0;
+        memset(split->inputs, 0, sizeof(split->inputs)); //HACK
+        int cur_backend_id = split->backend_id;
         for (int i = 0; i < graph->n_nodes; i++) {
             struct ggml_tensor * node = graph->nodes[i];

@@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                 continue;
             }

-            int tensor_backend_id = tensor_backend_id(node);
+            const int node_backend_id = tensor_backend_id(node);

-            GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
+            GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now

-            if (tensor_backend_id != cur_backend_id) {
-                sched->splits[cur_split].i_end = i;
-                cur_split++;
-                GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
-                sched->splits[cur_split].backend_id = tensor_backend_id;
-                sched->splits[cur_split].i_start = i;
-                sched->splits[cur_split].n_inputs = 0;
-                cur_backend_id = tensor_backend_id;
+            // check if we should start a new split based on the sources of the current node
+            bool need_new_split = false;
+            if (node_backend_id == cur_backend_id && split->n_inputs > 0) {
+                for (int j = 0; j < GGML_MAX_SRC; j++) {
+                    struct ggml_tensor * src = node->src[j];
+                    if (src == NULL) {
+                        continue;
+                    }
+                    // check if a weight is on a different backend
+                    // by starting a new split, the memory of the previously offloaded weights can be reused
+                    if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                        int src_backend_id = tensor_backend_id(src);
+                        if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                    // check if the split has too many inputs
+                    if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
+                        const size_t id = hash_id(src);
+                        int src_backend_id = sched->tensor_backend_id[id];
+                        if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+                            //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
+                            need_new_split = true;
+                            break;
+                        }
+                    }
+                }
+            }
+
+            if (node_backend_id != cur_backend_id || need_new_split) {
+                split->i_end = i;
+                i_split++;
+                if (i_split >= sched->splits_capacity) {
+                    sched->splits_capacity *= 2;
+                    sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+                    GGML_ASSERT(sched->splits != NULL);
+                }
+                GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
+                split = &sched->splits[i_split];
+                split->backend_id = node_backend_id;
+                split->i_start = i;
+                split->n_inputs = 0;
+                cur_backend_id = node_backend_id;
             }

             // find inputs that are not on the same backend
@@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     continue;
                 }

-                int src_backend_id = tensor_backend_id(src);
+                const int src_backend_id = tensor_backend_id(src);
                 assert(src_backend_id != -1); // all inputs should be assigned by now

-                if (src->flags & GGML_TENSOR_FLAG_INPUT) {
+                if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
                     size_t id = hash_id(src);
                     if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[src_backend_id];
@@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                         }
                         sched->tensor_copies[id][src_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = src_backend_id;
                         SET_CAUSE(tensor_copy, "4.cpy");
                     }
                     int n_graph_inputs = sched->n_graph_inputs++;
@@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                     }
                 }

-                if (src_backend_id != tensor_backend_id) {
+                if (src_backend_id != node_backend_id) {
                     // create a copy of the input in the split's backend
-                    size_t id = hash_id(src);
+                    const size_t id = hash_id(src);
                     if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
                         ggml_backend_t backend = sched->backends[cur_backend_id];
                         for (int c = 0; c < sched->n_copies; c++) {
@@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
                             ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor
                         }
                         sched->tensor_copies[id][cur_backend_id][c] = tensor_copy;
-                        tensor_backend_id(tensor_copy) = cur_backend_id;
                         SET_CAUSE(tensor_copy, "4.cpy");
                     }
-                    int n_inputs = sched->splits[cur_split].n_inputs++;
+                    int n_inputs = split->n_inputs++;
                     GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS);
-                    sched->splits[cur_split].inputs[n_inputs] = src;
+                    split->inputs[n_inputs] = src;
                 }
                 node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy];
             }
        }
-        sched->splits[cur_split].i_end = graph->n_nodes;
-        sched->n_splits = cur_split + 1;
+        split->i_end = graph->n_nodes;
+        sched->n_splits = i_split + 1;
    }
 #ifdef DEBUG_PASS4
    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
 #endif

-#ifndef NDEBUG
-    // sanity check: all sources should have the same backend as the node
-    for (int i = 0; i < graph->n_nodes; i++) {
-        struct ggml_tensor * node = graph->nodes[i];
-        ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-        if (tensor_backend == NULL) {
-            fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
-        }
-        if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) {
-            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
-                node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ?
-                    ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL");
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * src = node->src[j];
-            if (src == NULL) {
-                continue;
-            }
-            ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
-            if (src_backend != tensor_backend /* && src_backend != NULL */) {
-                fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
-                    node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
-                    j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
-            }
-            if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) {
-                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
-                    src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
-                    src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ?
-                        ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL");
-            }
-        }
-    }
-    fflush(stderr);
-#endif
-
     // create copies of the graph for each split
     // TODO: avoid this copy
-    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false);
+    struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false);
     for (int i = 0; i < sched->n_splits; i++) {
         struct ggml_backend_sched_split * split = &sched->splits[i];
         split->graph = ggml_graph_view(graph, split->i_start, split->i_end);

         // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
         for (int j = 0; j < split->n_inputs; j++) {
+            assert(graph_copy->size > (graph_copy->n_nodes + 1));
+
             struct ggml_tensor * input = split->inputs[j];
-            struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy];
+            const size_t input_id = hash_id(input);
+            struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy];

             // add a dependency to the input source so that it is not freed before the copy is done
             struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
             input_dep->src[0] = input;
-            sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+            sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id];
             graph_copy->nodes[graph_copy->n_nodes++] = input_dep;

             // add a dependency to the input copy so that it is allocated at the start of the split
@@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg
         }

         for (int j = split->i_start; j < split->i_end; j++) {
+            assert(graph_copy->size > graph_copy->n_nodes);
             sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
             graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
         }
@@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 }
                 ggml_backend_tensor_copy(input, input_cpy);
             } else {
+                // wait for the split backend to finish using the input before overwriting it
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                 } else {
                     ggml_backend_synchronize(split_backend);
-                    ggml_backend_synchronize(input_backend);
                 }

                 ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
             }
         }
@@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new(
     struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);

     // initialize hash table
-    sched->hash_set          = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    sched->hash_set          = ggml_hash_set_new(graph_size);
     sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
     sched->tensor_copies     = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
-    sched->node_backend_ids  = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
-    sched->leaf_backend_ids  = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size);
+
+    const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    sched->node_backend_ids  = calloc(sizeof(sched->node_backend_ids[0]), nodes_size);
+    sched->leaf_backend_ids  = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size);

     sched->n_backends = n_backends;

     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;

-    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+    const int initial_splits_capacity = 16;
+    sched->splits          = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits_capacity = initial_splits_capacity;

     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
@@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    free(sched->splits);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
@@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
 }

 bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+    GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes);
+
     ggml_backend_sched_split_graph(sched, measure_graph);

     // TODO: extract this to a separate function
@@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
 }

 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
-    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS);
+    GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes);

     ggml_backend_sched_split_graph(sched, graph);

@@ -70,11 +70,11 @@ extern "C" {
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
     GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
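A brief usage sketch of the public API as declared above (not from this commit; `backend` is assumed to be an initialized ggml_backend_t and `gf` a built ggml_cgraph):

    enum ggml_status status = ggml_backend_graph_compute_async(backend, gf);
    GGML_ASSERT(status == GGML_STATUS_SUCCESS);
    ggml_backend_synchronize(backend); // wait for the async compute to finish

    // ask whether this backend would rather run a given op itself even though
    // its weights sit in a CPU buffer (returns false if the backend does not
    // implement the hook):
    bool wants_it = ggml_backend_offload_op(backend, gf->nodes[0]);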
262 ggml-cuda.cu
@@ -82,6 +82,10 @@
 #define cudaGetDeviceProperties hipGetDeviceProperties
 #define cudaGetErrorString hipGetErrorString
 #define cudaGetLastError hipGetLastError
+#define cudaHostRegister hipHostRegister
+#define cudaHostRegisterPortable hipHostRegisterPortable
+#define cudaHostRegisterReadOnly hipHostRegisterReadOnly
+#define cudaHostUnregister hipHostUnregister
 #define cudaLaunchHostFunc hipLaunchHostFunc
 #ifdef GGML_HIP_UMA
 #define cudaMalloc hipMallocManaged
@@ -7787,11 +7791,7 @@ struct cuda_pool_alloc {

 static bool g_cublas_loaded = false;

-GGML_CALL bool ggml_cublas_loaded(void) {
-    return g_cublas_loaded;
-}
-
-GGML_CALL void ggml_init_cublas() {
+static void ggml_init_cublas() {
     static bool initialized = false;

     if (!initialized) {
@@ -7880,7 +7880,7 @@ GGML_CALL void ggml_init_cublas() {
     }
 }

-GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
+static void * ggml_cuda_host_malloc(size_t size) {
     if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
         return nullptr;
     }
@@ -7890,7 +7890,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
         cudaGetLastError();
-        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+        fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
             size/1024.0/1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -7898,7 +7898,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
     return ptr;
 }

-GGML_CALL void ggml_cuda_host_free(void * ptr) {
+static void ggml_cuda_host_free(void * ptr) {
     CUDA_CHECK(cudaFreeHost(ptr));
 }

@@ -9036,21 +9036,13 @@ static void ggml_cuda_op_soft_max(

     // positions tensor
     float * src2_dd = nullptr;
-    cuda_pool_alloc<float> src2_f;

     ggml_tensor * src2 = dst->src[2];
     const bool use_src2 = src2 != nullptr;

     if (use_src2) {
-        const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
-
-        if (src2_on_device) {
-            ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
-            src2_dd = (float *) src2_extra->data_device[g_main_device];
-        } else {
-            src2_dd = src2_f.alloc(ggml_nelements(src2));
-            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
-        }
+        ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
+        src2_dd = (float *) src2_extra->data_device[g_main_device];
     }

     soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream);
@@ -9107,55 +9099,24 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s
     ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
-    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU;
-    const bool  dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
-
     // dd = data device
     float * src0_ddf = nullptr;
     float * src1_ddf = nullptr;
     float *  dst_ddf = nullptr;

-    cuda_pool_alloc<float> src0_f;
-    cuda_pool_alloc<float> src1_f;
-    cuda_pool_alloc<float>  dst_f;
-
     ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

-    if (src0_on_device) {
-        src0_ddf = (float *) src0_extra->data_device[g_main_device];
-    } else {
-        src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
-    }
-
+    src0_ddf = (float *) src0_extra->data_device[g_main_device];
     if (use_src1) {
-        if (src1_on_device) {
-            src1_ddf = (float *) src1_extra->data_device[g_main_device];
-        } else {
-            src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
-        }
-    }
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
+        src1_ddf = (float *) src1_extra->data_device[g_main_device];
     }
+    dst_ddf = (float *) dst_extra->data_device[g_main_device];

     // do the computation
     op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     CUDA_CHECK(cudaGetLastError());

-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
-    }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
 }

 static void ggml_cuda_set_peer_access(const int n_tokens) {
@@ -9251,7 +9212,6 @@ static void ggml_cuda_op_mul_mat(
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);

@@ -9322,13 +9282,13 @@ static void ggml_cuda_op_mul_mat(

         used_devices++;

-        const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-        const bool  dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+        const bool src1_on_device = id == g_main_device; // TODO: check from buffer
+        const bool  dst_on_device = id == g_main_device;

         ggml_cuda_set_device(id);
         cudaStream_t stream = g_cudaStreams[id][0];

-        if (src0_on_device && src0_is_contiguous) {
+        if (src0_is_contiguous) {
             dev[id].src0_dd = (char *) src0_extra->data_device[id];
         } else {
             dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0));
@@ -9374,8 +9334,8 @@ static void ggml_cuda_op_mul_mat(
                 continue;
             }

-            const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
-            const bool  dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device;
+            const bool src1_on_device = id == g_main_device; // TODO: check from buffer
+            const bool  dst_on_device = id == g_main_device;
             const int64_t row_diff = dev[id].row_high - dev[id].row_low;

             ggml_cuda_set_device(id);
@@ -9400,12 +9360,12 @@ static void ggml_cuda_op_mul_mat(

                 // the main device memory buffer can be on VRAM scratch, with space for all partial results
                 // in that case an offset on dst_ddf_i is needed
-                if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) {
+                if (id == g_main_device) {
                     dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split
                 }

                 // copy src0, src1 to device if necessary
-                if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) {
+                if (src1_is_contiguous) {
                     if (id != g_main_device) {
                         if (convert_src1_to_q8_1) {
                             char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset;
@@ -9418,19 +9378,19 @@ static void ggml_cuda_op_mul_mat(
                                 src1_ncols*ne10*sizeof(float), stream));
                         }
                     }
-                } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) {
+                } else if (src1_on_device && !src1_is_contiguous) {
                     CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
                             src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
                 } else {
                     GGML_ASSERT(false);
                 }

-                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) {
+                if (convert_src1_to_q8_1 && !src1_is_contiguous) {
                     quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                     CUDA_CHECK(cudaGetLastError());
                 }

-                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+                if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) {
                     CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream));
                 }

@@ -9441,17 +9401,7 @@ static void ggml_cuda_op_mul_mat(

                 // copy dst to host or other device if necessary
                 if (!dst_on_device) {
-                    void * dst_off_device;
-                    cudaMemcpyKind kind;
-                    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-                        dst_off_device = dst->data;
-                        kind = cudaMemcpyDeviceToHost;
-                    } else if (dst->backend == GGML_BACKEND_TYPE_GPU) {
-                        dst_off_device = dst_extra->data_device[g_main_device];
-                        kind = cudaMemcpyDeviceToDevice;
-                    } else {
-                        GGML_ASSERT(false);
-                    }
+                    void * dst_off_device = dst_extra->data_device[g_main_device];
                     if (split) {
                         // src0 = weight matrix is saved as a transposed matrix for better memory layout.
                         // dst is NOT transposed.
@@ -9462,7 +9412,6 @@ static void ggml_cuda_op_mul_mat(
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                         dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;
 #if !defined(GGML_USE_HIPBLAS)
-                        if (kind == cudaMemcpyDeviceToDevice) {
                         // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
                         cudaMemcpy3DPeerParms p = {};
                         p.dstDevice = g_main_device;
@@ -9471,19 +9420,18 @@ static void ggml_cuda_op_mul_mat(
                         p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols);
                         p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1);
                         CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream));
-                        } else
-#endif
-                        {
+#else
+                        // HIP does not support cudaMemcpy3DPeerAsync or vmm pools
                         CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float),
                                                      dst_dd_i, row_diff*sizeof(float),
                                                      row_diff*sizeof(float), src1_ncols,
-                                                     kind, stream));
-                        }
+                                                     cudaMemcpyDeviceToDevice, stream));
+#endif
                     } else {
                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                         dhf_dst_i += src1_col_0*ne0;
-                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream));
+                        CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream));
                     }
                 }

@@ -9510,11 +9458,6 @@ static void ggml_cuda_op_mul_mat(
             }
         }
     }
-
-    if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-        ggml_cuda_set_device(g_main_device);
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
 }

 static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -9599,36 +9542,19 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
 static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;

-    const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU;
-
     // dd = data device
     float * src0_ddf = nullptr;
     float * src1_ddf = nullptr;
     float *  dst_ddf = nullptr;

-    cuda_pool_alloc<float> dst_f;
-
     ggml_cuda_set_device(g_main_device);
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];

-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
+    dst_ddf = (float *) dst_extra->data_device[g_main_device];

     // do the computation
     ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     CUDA_CHECK(cudaGetLastError());

-    // copy dst to host if necessary
|
|
||||||
if (!dst_on_device) {
|
|
||||||
CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream));
|
|
||||||
}
|
|
||||||
|
|
||||||
if (dst->backend == GGML_BACKEND_TYPE_CPU) {
|
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
|
@ -9639,21 +9565,6 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
|
||||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
|
||||||
if (!g_cublas_loaded) return false;
|
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
|
||||||
|
|
||||||
const int64_t ne0 = dst->ne[0];
|
|
||||||
const int64_t ne1 = dst->ne[1];
|
|
||||||
|
|
||||||
// TODO: find the optimal values for these
|
|
||||||
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
|
|
||||||
src1->type == GGML_TYPE_F32 &&
|
|
||||||
dst->type == GGML_TYPE_F32 &&
|
|
||||||
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
||||||
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
||||||
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT);
|
||||||
|
@ -9891,11 +9802,6 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
const bool all_on_device =
|
|
||||||
(src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) &&
|
|
||||||
(src1->backend == GGML_BACKEND_TYPE_GPU) &&
|
|
||||||
( dst->backend == GGML_BACKEND_TYPE_GPU);
|
|
||||||
|
|
||||||
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT;
|
||||||
|
|
||||||
int64_t min_compute_capability = INT_MAX;
|
int64_t min_compute_capability = INT_MAX;
|
||||||
|
@ -9972,13 +9878,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||||
|
|
||||||
if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||||
// KQ single-batch
|
// KQ single-batch
|
||||||
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
||||||
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||||
// KQV single-batch
|
// KQV single-batch
|
||||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||||
} else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
} else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||||
// KQ + KQV multi-batch
|
// KQ + KQV multi-batch
|
||||||
ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
|
ggml_cuda_mul_mat_batched_cublas(src0, src1, dst);
|
||||||
} else if (use_dequantize_mul_mat_vec) {
|
} else if (use_dequantize_mul_mat_vec) {
|
||||||
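Editor's note: the hunks above drop the per-tensor backend checks (all_on_device, dst->backend, the cudaMemcpyKind switch), so every dst copy in ggml_cuda_op_mul_mat is now device-to-device and kernel selection depends only on the split flag, fp16 performance and tensor layout. A rough, illustrative restatement of the resulting dispatch order (not a verbatim copy of the function; the helper and flag names are stand-ins):

// Illustrative sketch only: the order in which ggml_cuda_mul_mat picks a kernel
// after this change. `p021`, `nc_f16` and `batched` stand for the layout tests
// spelled out in the hunk above.
static const char * pick_mul_mat_path(bool split, bool fp16_fast, bool p021, bool nc_f16, bool batched) {
    if (!split && !fp16_fast && p021)    return "mul_mat_vec_p021";        // KQ single-batch
    if (!split && !fp16_fast && nc_f16)  return "mul_mat_vec_nc";          // KQV single-batch
    if (!split &&  fp16_fast && batched) return "mul_mat_batched_cublas";  // KQ + KQV multi-batch
    return "dequantize_mul_mat_vec / mmq / cuBLAS";                        // general fallback chain
}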
@@ -10178,6 +10084,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 ggml_cuda_mul_mat_id_cublas(dst);
 // TODO: mmq/mmv support
 #endif
+cudaStream_t stream = g_cudaStreams[g_main_device][0];

 const size_t nb11 = src1->nb[1];
 const size_t nb1 = dst->nb[1];

@@ -10187,16 +10094,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 const int32_t n_as = ((int32_t *) dst->op_params)[1];

 std::vector<char> ids_host(ggml_nbytes(ids));

-cudaStream_t stream = g_cudaStreams[g_main_device][0];

-if (ids->backend == GGML_BACKEND_TYPE_GPU) {
 const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
 CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
 CUDA_CHECK(cudaStreamSynchronize(stream));
-} else {
-memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-}

 const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
 const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;

@@ -10213,20 +10113,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 src1_row.extra = &src1_row_extra;
 dst_row.extra = &dst_row_extra;

-char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ?
-(char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ?
-(char *) dst->data : (char *) dst_extra->data_device[g_main_device];
+char * src1_original = (char *) src1_extra->data_device[g_main_device];
+char * dst_original = (char *) dst_extra->data_device[g_main_device];

 if (src1->ne[1] == 1) {
-GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU);

 for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-//int32_t row_id;
-//CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-//CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));

 const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);

 GGML_ASSERT(row_id >= 0 && row_id < n_as);

@@ -10248,11 +10139,6 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
 dst_row_extra.data_device[g_main_device] = dst_contiguous.get();

-const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ?
-cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice;
-const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ?
-cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice;

 for (int32_t row_id = 0; row_id < n_as; ++row_id) {
 const struct ggml_tensor * src0_row = dst->src[row_id + 2];

@@ -10267,7 +10153,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 GGML_ASSERT(row_id >= 0 && row_id < n_as);

 CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11,
-nb11, src1_kind, stream));
+nb11, cudaMemcpyDeviceToDevice, stream));
 num_src1_rows++;
 }

@@ -10299,15 +10185,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
 GGML_ASSERT(row_id >= 0 && row_id < n_as);

 CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1,
-nb1, dst_kind, stream));
+nb1, cudaMemcpyDeviceToDevice, stream));
 num_src1_rows++;
 }
 }
 }

-if (dst->backend == GGML_BACKEND_TYPE_CPU) {
-CUDA_CHECK(cudaStreamSynchronize(stream));
-}
 }

 static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {

@@ -10435,7 +10317,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
 return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }

-GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
+static void ggml_cuda_set_main_device(const int main_device) {
 if (main_device >= g_device_count) {
 fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
 main_device, g_device_count, g_main_device);

@@ -10450,18 +10332,9 @@ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
 }
 }

-GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) {
 if (!g_cublas_loaded) return false;

-ggml_cuda_func_t func;
-const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU
-|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT))
-|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU);

-if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
-return false;
-}

 if (tensor->op == GGML_OP_MUL_MAT) {
 if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG

@@ -10471,6 +10344,8 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
 }
 }

+ggml_cuda_func_t func;

 switch (tensor->op) {
 case GGML_OP_REPEAT:
 func = ggml_cuda_repeat;

@@ -10548,15 +10423,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
 func = ggml_cuda_rms_norm;
 break;
 case GGML_OP_MUL_MAT:
-if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-return false;
-}
 func = ggml_cuda_mul_mat;
 break;
 case GGML_OP_MUL_MAT_ID:
-if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
-return false;
-}
 func = ggml_cuda_mul_mat_id;
 break;
 case GGML_OP_SCALE:

@@ -10613,17 +10482,11 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st
 ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
 }

-if (params->ith != 0) {
-return true;
-}
-if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
-return true;
-}
 func(tensor->src[0], tensor->src[1], tensor);
 return true;
 }

-GGML_CALL int ggml_cuda_get_device_count() {
+static int ggml_cuda_get_device_count() {
 int device_count;
 if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
 return 0;

@@ -10631,7 +10494,7 @@ GGML_CALL int ggml_cuda_get_device_count() {
 return device_count;
 }

-GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+static void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
 cudaDeviceProp prop;
 CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
 snprintf(description, description_size, "%s", prop.name);

@@ -10736,6 +10599,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
 size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

 if (padded_size > original_size && tensor->view_src == nullptr) {
+ggml_cuda_set_device(ctx->device);
 CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
 }
 }

@@ -10873,6 +10737,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
 };

 GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+ggml_init_cublas();

 // FIXME: this is not thread safe
 if (device >= ggml_backend_cuda_get_device_count()) {
 return nullptr;

@@ -11157,6 +11023,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
 };

 GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+ggml_init_cublas();

 // FIXME: this is not thread safe
 static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;

@@ -11348,9 +11216,6 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t

 ggml_cuda_set_main_device(cuda_ctx->device);

-ggml_compute_params params = {};
-params.type = GGML_TASK_TYPE_COMPUTE;
-params.ith = 0;
 for (int i = 0; i < cgraph->n_nodes; i++) {
 ggml_tensor * node = cgraph->nodes[i];

@@ -11372,7 +11237,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
 }
 #endif

-bool ok = ggml_cuda_compute_forward(&params, node);
+bool ok = ggml_cuda_compute_forward(node);
 if (!ok) {
 fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
 }

@@ -11509,6 +11374,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
 UNUSED(backend);
 }

+GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
+const int min_batch_size = 32;

+return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;

+UNUSED(backend);
+}

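The new offload_op callback gives the scheduler a per-backend hint about which operations are worth moving to that backend even when their inputs live elsewhere; for CUDA the rule added above is "batches of at least 32 rows, except GGML_OP_GET_ROWS". A minimal sketch of how a caller could consult the hint through the backend interface (the helper below is hypothetical; in-tree the check lives in the ggml-backend scheduler):

// Hypothetical helper: ask a backend whether `op` should be offloaded to it.
static bool backend_wants_offload(ggml_backend_t backend, const struct ggml_tensor * op) {
    if (backend->iface.offload_op == NULL) {
        return false; // backend expresses no preference (Metal, Vulkan, Kompute, SYCL below)
    }
    return backend->iface.offload_op(backend, op); // CUDA: op->ne[1] >= 32 && op->op != GGML_OP_GET_ROWS
}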
 static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
 ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;

@@ -11541,6 +11414,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 if (ggml_backend_is_cuda(event->backend)) {
 CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0));
 } else {
+#if 0
 // untested
 auto wait_fn = [](void * user_data) {
 ggml_backend_event_t event = (ggml_backend_event_t)user_data;

@@ -11548,6 +11422,8 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
 };

 CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event));
+#endif
+GGML_ASSERT(false);
 }
 }

@@ -11568,6 +11444,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
 /* .graph_plan_compute = */ NULL,
 /* .graph_compute = */ ggml_backend_cuda_graph_compute,
 /* .supports_op = */ ggml_backend_cuda_supports_op,
+/* .offload_op = */ ggml_backend_cuda_offload_op,
 /* .event_new = */ ggml_backend_cuda_event_new,
 /* .event_free = */ ggml_backend_cuda_event_free,
 /* .event_record = */ ggml_backend_cuda_event_record,

@@ -11581,7 +11458,7 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 }

 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
-ggml_init_cublas(); // TODO: remove from ggml.c
+ggml_init_cublas();

 if (device < 0 || device >= ggml_cuda_get_device_count()) {
 fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);

@@ -11624,6 +11501,31 @@ GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, si
 CUDA_CHECK(cudaMemGetInfo(free, total));
 }

+GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
+if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+return false;
+}

+cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
+if (err != cudaSuccess) {
+// clear the error
+cudaGetLastError();

+fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+size/1024.0/1024.0, cudaGetErrorString(err));
+return false;
+}
+return true;
+}

+GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
+cudaError_t err = cudaHostUnregister(buffer);
+if (err != cudaSuccess) {
+// clear the error
+cudaGetLastError();
+}
+}

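The two functions added above let an application pin an existing host allocation so the CUDA driver can DMA from it directly; registration is best-effort and is skipped when GGML_CUDA_NO_PINNED is set. A minimal usage sketch, assuming a plain malloc'd staging buffer that the GPU only reads (the registration above uses cudaHostRegisterReadOnly):

// Sketch: pin a host staging buffer for faster host-to-device copies.
size_t buf_size = (size_t)64*1024*1024;
void * staging  = malloc(buf_size);

bool pinned = ggml_backend_cuda_register_host_buffer(staging, buf_size);
if (!pinned) {
    fprintf(stderr, "using pageable host memory\n"); // copies still work, just slower
}

// ... fill `staging` and upload tensors to the CUDA backend ...

ggml_backend_cuda_unregister_host_buffer(staging); // the unregister path clears any CUDA error itself
free(staging);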
 // backend registry
 GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
 ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);

21 ggml-cuda.h

@@ -17,29 +17,17 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES 16

-// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API GGML_CALL void ggml_init_cublas(void);

-// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API GGML_CALL bool ggml_cublas_loaded(void);

-GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
-GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);

-GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

-GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);

 GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

+// device buffer
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

 // split tensor buffer that splits matrices by rows across multiple devices
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

@@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
 GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

+GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

 #ifdef __cplusplus
 }
 #endif

@@ -1951,6 +1951,7 @@ static struct ggml_backend_i kompute_backend_i = {
 /* .graph_plan_compute = */ NULL,
 /* .graph_compute = */ ggml_backend_kompute_graph_compute,
 /* .supports_op = */ ggml_backend_kompute_supports_op,
+/* .offload_op = */ NULL,
 /* .event_new = */ NULL,
 /* .event_free = */ NULL,
 /* .event_record = */ NULL,

@@ -2837,6 +2837,7 @@ static struct ggml_backend_i ggml_backend_metal_i = {
 /* .graph_plan_compute = */ NULL,
 /* .graph_compute = */ ggml_backend_metal_graph_compute,
 /* .supports_op = */ ggml_backend_metal_supports_op,
+/* .offload_op = */ NULL,
 /* .event_new = */ NULL,
 /* .event_free = */ NULL,
 /* .event_record = */ NULL,

262 ggml-sycl.cpp

@@ -16,6 +16,7 @@
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
+#include <cstdlib>
 #include <float.h>
 #include <limits>
 #include <stdint.h>

@@ -24,10 +25,9 @@
 #include <cmath>
 #include <iostream>
 #include <fstream>

 #include <stdio.h>
 #include <stdlib.h>
+#include <regex>

 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>

@@ -82,6 +82,30 @@ Following definition copied from DPCT head files, which are used by ggml-sycl.cp
 #define __dpct_noinline__ __attribute__((noinline))
 #endif

+std::string get_device_type_name(const sycl::device &Device) {
+auto DeviceType = Device.get_info<sycl::info::device::device_type>();
+switch (DeviceType) {
+case sycl::info::device_type::cpu:
+return "cpu";
+case sycl::info::device_type::gpu:
+return "gpu";
+case sycl::info::device_type::host:
+return "host";
+case sycl::info::device_type::accelerator:
+return "acc";
+default:
+return "unknown";
+}
+}

+std::string get_device_backend_and_type(const sycl::device &device) {
+std::stringstream device_type;
+sycl::backend backend = device.get_backend();
+device_type << backend << ":" << get_device_type_name(device);
+return device_type.str();
+}

 namespace dpct
 {
 typedef sycl::queue *queue_ptr;

@@ -942,17 +966,65 @@ namespace dpct

 private:
 mutable std::recursive_mutex m_mutex;
+static bool compare_dev(sycl::device &device1, sycl::device &device2)
+{
+dpct::device_info prop1;
+dpct::get_device_info(prop1, device1);
+dpct::device_info prop2;
+dpct::get_device_info(prop2, device2);
+return prop1.get_max_compute_units() > prop2.get_max_compute_units();
+}
+static int convert_backend_index(std::string & backend) {
+if (backend == "ext_oneapi_level_zero:gpu") return 0;
+if (backend == "opencl:gpu") return 1;
+if (backend == "opencl:cpu") return 2;
+if (backend == "opencl:acc") return 3;
+printf("convert_backend_index: can't handle backend=%s\n", backend.c_str());
+GGML_ASSERT(false);
+}
+static bool compare_backend(std::string &backend1, std::string &backend2) {
+return convert_backend_index(backend1) < convert_backend_index(backend2);
+}
 dev_mgr()
 {
 sycl::device default_device =
 sycl::device(sycl::default_selector_v);
 _devs.push_back(std::make_shared<device_ext>(default_device));

-std::vector<sycl::device> sycl_all_devs =
-sycl::device::get_devices(sycl::info::device_type::all);
+std::vector<sycl::device> sycl_all_devs;
 // Collect other devices except for the default device.
 if (default_device.is_cpu())
 _cpu_device = 0;

+auto Platforms = sycl::platform::get_platforms();
+// Keep track of the number of devices per backend
+std::map<sycl::backend, size_t> DeviceNums;
+std::map<std::string, std::vector<sycl::device>> backend_devices;

+while (!Platforms.empty()) {
+auto Platform = Platforms.back();
+Platforms.pop_back();
+auto devices = Platform.get_devices();
+std::string backend_type = get_device_backend_and_type(devices[0]);
+for (const auto &device : devices) {
+backend_devices[backend_type].push_back(device);
+}
+}

+std::vector<std::string> keys;
+for(auto it = backend_devices.begin(); it != backend_devices.end(); ++it) {
+keys.push_back(it->first);
+}
+std::sort(keys.begin(), keys.end(), compare_backend);

+for (auto &key : keys) {
+std::vector<sycl::device> devs = backend_devices[key];
+std::sort(devs.begin(), devs.end(), compare_dev);
+for (const auto &dev : devs) {
+sycl_all_devs.push_back(dev);
+}
+}

 for (auto &dev : sycl_all_devs)
 {
 if (dev == default_device)

@@ -3202,6 +3274,11 @@ static int g_work_group_size = 0;
 #define GGML_SYCL_MMV_Y 1
 #endif

+enum ggml_sycl_backend_gpu_mode {
+SYCL_UNSET_GPU_MODE = -1,
+SYCL_SINGLE_GPU_MODE = 0,
+SYCL_MUL_GPU_MODE
+};

 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");

@@ -3401,12 +3478,31 @@ class sycl_gpu_mgr {
 int work_group_size = 0;
 std::string gpus_list = "";

+/*
+Use all GPUs with same top max compute units
+*/
 sycl_gpu_mgr() {
 detect_sycl_gpu_list_with_max_cu();
 get_allow_gpus();
 create_context_with_gpus();
 }

+/*
+Only use the assigned GPU
+*/
+sycl_gpu_mgr(int main_gpu_id) {
+sycl::device device = dpct::dev_mgr::instance().get_device(main_gpu_id);
+dpct::device_info prop;
+dpct::get_device_info(prop, device);
+gpus.push_back(main_gpu_id);
+devices.push_back(device);
+work_group_size = prop.get_max_work_group_size();
+max_compute_units = prop.get_max_compute_units();

+get_allow_gpus();
+create_context_with_gpus();
+}

 void create_context_with_gpus() {
 sycl::context ctx = sycl::context(devices);
 assert(gpus.size() > 0);

@@ -3422,7 +3518,7 @@ class sycl_gpu_mgr {
 gpus_list += std::to_string(gpus[i]);
 gpus_list += ",";
 }
-if (gpus_list.length() > 2) {
+if (gpus_list.length() > 1) {
 gpus_list.pop_back();
 }
 }

@@ -3451,7 +3547,7 @@ class sycl_gpu_mgr {
 dpct::device_info prop;
 dpct::get_device_info(prop, device);
 if (max_compute_units == prop.get_max_compute_units() &&
-prop.get_major_version() == 1) {
+is_ext_oneapi_device(device)) {
 gpus.push_back(id);
 devices.push_back(device);
 work_group_size = prop.get_max_work_group_size();

@@ -3471,8 +3567,8 @@ class sycl_gpu_mgr {
 if (gpus[i] == id)
 return i;
 }
-assert(false);
-return -1;
+printf("miss to get device index by id=%d\n", id);
+GGML_ASSERT(false);
 }

 int get_next_index(int id) {

@@ -3481,8 +3577,16 @@ class sycl_gpu_mgr {
 if (gpus[i] == id)
 return i;
 }
-assert(false);
-return -1;
+GGML_ASSERT(false);
 }

+bool is_ext_oneapi_device(const sycl::device &dev) {
+sycl::backend dev_backend = dev.get_backend();
+if (dev_backend == sycl::backend::ext_oneapi_level_zero ||
+dev_backend == sycl::backend::ext_oneapi_cuda ||
+dev_backend == sycl::backend::ext_oneapi_hip)
+return true;
+return false;
 }
 };

@@ -3491,11 +3595,14 @@ static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_id = -1;
+static bool g_ggml_backend_sycl_buffer_type_initialized = false;

 static std::array<float, GGML_SYCL_MAX_DEVICES> g_default_tensor_split = {};

 static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};

+static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode = SYCL_UNSET_GPU_MODE;

 struct sycl_device_capabilities {
 int cc; // compute capability
 bool vmm; // virtual memory support

@@ -12999,17 +13106,20 @@ bool ggml_sycl_loaded(void) {
 return g_sycl_loaded;
 }

-void print_device_detail(int id) {
+void print_device_detail(int id, sycl::device &device, std::string device_type) {

 dpct::device_info prop;
 SYCL_CHECK(CHECK_TRY_ERROR(
-dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id))));
-sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
+dpct::get_device_info(prop, device)));
 std::string version;
 version += std::to_string(prop.get_major_version());
 version += ".";
 version += std::to_string(prop.get_minor_version());

-fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id,
+device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), "");

+fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(),
 prop.get_name(), version.c_str(), prop.get_max_compute_units(),
 prop.get_max_work_group_size(), prop.get_max_sub_group_size(),
 prop.get_global_mem_size());

@@ -13017,16 +13127,32 @@ void print_device_detail(int id) {

 void ggml_backend_sycl_print_sycl_devices() {
 int device_count = dpct::dev_mgr::instance().device_count();
+std::map<std::string, size_t> DeviceNums;
 fprintf(stderr, "found %d SYCL devices:\n", device_count);
-fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n");
-fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n");
+fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n");
+fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n");
+fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n");
 for (int id = 0; id < device_count; ++id) {
-print_device_detail(id);
+sycl::device device = dpct::dev_mgr::instance().get_device(id);
+sycl::backend backend = device.get_backend();
+std::string backend_type = get_device_backend_and_type(device);
+int type_id=DeviceNums[backend_type]++;
+std::stringstream device_type;
+device_type << "[" << backend_type << ":" << std::to_string(type_id) << "]";
+print_device_detail(id, device, device_type.str());
 }
 }

 void print_gpu_device_list() {
-fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n",
+GGML_ASSERT(g_sycl_gpu_mgr);

+char* hint=NULL;
+if (g_ggml_sycl_backend_gpu_mode == SYCL_SINGLE_GPU_MODE) {
+hint = "use %d SYCL GPUs: [%s] with Max compute units:%d\n";
+} else {
+hint = "detect %d SYCL GPUs: [%s] with top Max compute units:%d\n";
+}
+fprintf(stderr, hint,
 g_sycl_gpu_mgr->get_gpu_count(),
 g_sycl_gpu_mgr->gpus_list.c_str(),
 g_sycl_gpu_mgr->max_compute_units);

@@ -13065,23 +13191,6 @@ void ggml_init_sycl() try {
 #else
 fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__);
 #endif
-if (CHECK_TRY_ERROR(g_all_sycl_device_count =
-dpct::dev_mgr::instance().device_count()) != 0) {
-initialized = true;
-g_sycl_loaded = false;
-return;
-}
-GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
-ggml_backend_sycl_print_sycl_devices();

-if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();

-g_device_count = g_sycl_gpu_mgr->get_gpu_count();
-g_work_group_size = g_sycl_gpu_mgr->work_group_size;

-print_gpu_device_list();

-int64_t total_vram = 0;

 /* NOT REMOVE, keep it for next optimize for XMX.
 #if defined(SYCL_USE_XMX)

@@ -13090,6 +13199,33 @@ void ggml_init_sycl() try {
 fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 */

+if (CHECK_TRY_ERROR(g_all_sycl_device_count =
+dpct::dev_mgr::instance().device_count()) != 0) {
+initialized = true;
+g_sycl_loaded = false;
+return;
+}
+GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
+ggml_backend_sycl_print_sycl_devices();
+initialized = true;
+g_sycl_loaded = true;
+}
+}
+catch (sycl::exception const &exc) {
+std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+<< ", line:" << __LINE__ << std::endl;
+std::exit(1);
+}

+void ggml_init_by_gpus(int device_count) try {
+g_device_count = device_count;
+g_work_group_size = g_sycl_gpu_mgr->work_group_size;

+int64_t total_vram = 0;

+print_gpu_device_list();

 for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
 g_device_caps[id].vmm = 0;
 g_device_caps[id].device_id = -1;

@@ -13132,10 +13268,6 @@ void ggml_init_sycl() try {
 // create sycl handle
 SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream));
 }

-initialized = true;
-g_sycl_loaded = true;
-}
 }
 catch (sycl::exception const &exc) {
 std::cerr << exc.what() << "Exception caught at file:" << __FILE__

@@ -16542,22 +16674,24 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
 /* .is_host = */ nullptr,
 };

-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) {
+if (device_index>=g_device_count or device_index<0) {
+printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n",
+device_index, g_device_count-1);
+GGML_ASSERT(device_index<g_device_count);
+}
 static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];

-static bool ggml_backend_sycl_buffer_type_initialized = false;

-if (!ggml_backend_sycl_buffer_type_initialized) {
+if (!g_ggml_backend_sycl_buffer_type_initialized) {
 for (int i = 0; i < g_device_count; i++) {
 ggml_backend_sycl_buffer_types[i] = {
 /* .iface = */ ggml_backend_sycl_buffer_type_interface,
 /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])},
 };
 }
-ggml_backend_sycl_buffer_type_initialized = true;
+g_ggml_backend_sycl_buffer_type_initialized = true;
 }
-return &ggml_backend_sycl_buffer_types[device];
+return &ggml_backend_sycl_buffer_types[device_index];
 }

 // sycl split buffer type

@@ -17256,6 +17390,7 @@ static ggml_backend_i ggml_backend_sycl_interface = {
 /* .graph_plan_compute = */ NULL,
 /* .graph_compute = */ ggml_backend_sycl_graph_compute,
 /* .supports_op = */ ggml_backend_sycl_supports_op,
+/* .offload_op = */ NULL,
 /* .event_new = */ NULL,
 /* .event_free = */ NULL,
 /* .event_record = */ NULL,

@@ -17310,11 +17445,42 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) {
 return g_sycl_gpu_mgr->get_index(device_id);
 }

+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) {
+return g_sycl_gpu_mgr->gpus[device_index];
+}

+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
+GGML_ASSERT(main_gpu_id<g_all_sycl_device_count);
+fprintf(stderr, "ggml_backend_sycl_set_single_device: use single device: [%d]\n", main_gpu_id);
+if (g_sycl_gpu_mgr) {
+delete g_sycl_gpu_mgr;
+}
+g_sycl_gpu_mgr = new sycl_gpu_mgr(main_gpu_id);
+g_ggml_sycl_backend_gpu_mode = SYCL_SINGLE_GPU_MODE;
+ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+g_ggml_backend_sycl_buffer_type_initialized = false;
+}

+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode() {
+if (g_ggml_sycl_backend_gpu_mode == SYCL_MUL_GPU_MODE) {
+return;
+}

+fprintf(stderr, "ggml_backend_sycl_set_mul_device_mode: true\n");

+if (g_sycl_gpu_mgr) {
+delete g_sycl_gpu_mgr;
+}
+g_sycl_gpu_mgr = new sycl_gpu_mgr();
+g_ggml_sycl_backend_gpu_mode = SYCL_MUL_GPU_MODE;
+ggml_init_by_gpus(g_sycl_gpu_mgr->get_gpu_count());
+g_ggml_backend_sycl_buffer_type_initialized = false;
+}

 extern "C" int ggml_backend_sycl_reg_devices();

 int ggml_backend_sycl_reg_devices() {
-if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr();
-g_device_count = g_sycl_gpu_mgr->get_gpu_count();
+ggml_backend_sycl_set_mul_device_mode();
 assert(g_device_count>0);
 for (int i = 0; i < g_device_count; i++) {
 int id = g_sycl_gpu_mgr->gpus[i];

@@ -29,6 +29,11 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_typ
 GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);

+// TODO: these are temporary
+// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
+GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
+GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
+GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
 #ifdef __cplusplus
 }
 #endif

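ggml_backend_sycl_set_single_device_mode and its multi-device counterpart rebuild the global GPU manager before any buffer types are created, which is why they also reset g_ggml_backend_sycl_buffer_type_initialized. A minimal sketch of a caller restricting SYCL to one physical device (driver-side code, assumed rather than taken from this patch):

// Sketch: run the SYCL backend on a single GPU chosen by the device id
// printed by ggml_backend_sycl_print_sycl_devices().
int main_gpu_id = 0;
ggml_backend_sycl_set_single_device_mode(main_gpu_id);    // rebuilds g_sycl_gpu_mgr with one device
int dev_index = ggml_backend_sycl_get_device_index(main_gpu_id);
ggml_backend_buffer_type_t buft = ggml_backend_sycl_buffer_type(dev_index); // takes an index, not a raw id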
@@ -710,6 +710,12 @@ static uint32_t ggml_vk_find_queue_family_index(std::vector<vk::QueueFamilyPrope
 }
 }

+// All commands that are allowed on a queue that supports transfer operations are also allowed on a queue that supports either graphics or compute operations.
+// Thus, if the capabilities of a queue family include VK_QUEUE_GRAPHICS_BIT or VK_QUEUE_COMPUTE_BIT, then reporting the VK_QUEUE_TRANSFER_BIT capability separately for that queue family is optional.
+if (compute_index >= 0) {
+return compute_index;
+}

 std::cerr << "ggml_vulkan: No suitable queue family index found." << std::endl;

 for(auto &q_family : queue_family_props) {

@@ -5693,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = {
 /* .graph_plan_compute = */ NULL,
 /* .graph_compute = */ ggml_backend_vk_graph_compute,
 /* .supports_op = */ ggml_backend_vk_supports_op,
+/* .offload_op = */ NULL,
 /* .event_new = */ NULL,
 /* .event_free = */ NULL,
 /* .event_record = */ NULL,

ggml.c (131 changed lines)

@@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #else
 #include <cblas.h>
 #endif
-#elif defined(GGML_USE_CUBLAS)
-#include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
 #elif defined(GGML_USE_VULKAN)
@@ -470,6 +468,19 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
         .type_size = sizeof(int32_t),
         .is_quantized = false,
     },
+    [GGML_TYPE_I64] = {
+        .type_name = "i64",
+        .blck_size = 1,
+        .type_size = sizeof(int64_t),
+        .is_quantized = false,
+    },
+    [GGML_TYPE_F64] = {
+        .type_name = "f64",
+        .blck_size = 1,
+        .type_size = sizeof(double),
+        .is_quantized = false,
+        .nrows = 1,
+    },
     [GGML_TYPE_F32] = {
         .type_name = "f32",
         .blck_size = 1,
@@ -918,6 +929,101 @@ inline static float vaddvq_f32(float32x4_t v) {
 #define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
 #endif

+#elif defined(__AVX512F__)
+
+#define GGML_SIMD
+
+// F32 AVX512
+
+#define GGML_F32_STEP 64
+#define GGML_F32_EPR  16
+
+#define GGML_F32x16         __m512
+#define GGML_F32x16_ZERO    _mm512_setzero_ps()
+#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
+#define GGML_F32x16_LOAD    _mm512_loadu_ps
+#define GGML_F32x16_STORE   _mm512_storeu_ps
+// _mm512_fmadd_ps is defined in AVX512F so no guard is required
+#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32x16_ADD     _mm512_add_ps
+#define GGML_F32x16_MUL     _mm512_mul_ps
+#define GGML_F32x16_REDUCE(res, x)                \
+do {                                              \
+    int offset = GGML_F32_ARR >> 1;               \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    res = _mm512_reduce_add_ps(x[0]);             \
+} while (0)
+
+// TODO: is this optimal ?
+
+#define GGML_F32_VEC        GGML_F32x16
+#define GGML_F32_VEC_ZERO   GGML_F32x16_ZERO
+#define GGML_F32_VEC_SET1   GGML_F32x16_SET1
+#define GGML_F32_VEC_LOAD   GGML_F32x16_LOAD
+#define GGML_F32_VEC_STORE  GGML_F32x16_STORE
+#define GGML_F32_VEC_FMA    GGML_F32x16_FMA
+#define GGML_F32_VEC_ADD    GGML_F32x16_ADD
+#define GGML_F32_VEC_MUL    GGML_F32x16_MUL
+#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
+
+// F16 AVX512
+
+// F16 AVX
+
+#define GGML_F16_STEP 64
+#define GGML_F16_EPR  16
+
+// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
+
+#define GGML_F32Cx16             __m512
+#define GGML_F32Cx16_ZERO        _mm512_setzero_ps()
+#define GGML_F32Cx16_SET1(x)     _mm512_set1_ps(x)
+
+// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
+// so F16C guard isn't required
+#define GGML_F32Cx16_LOAD(x)     _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x)))
+#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
+
+#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
+#define GGML_F32Cx16_ADD         _mm512_add_ps
+#define GGML_F32Cx16_MUL         _mm512_mul_ps
+#define GGML_F32Cx16_REDUCE(res, x)               \
+do {                                              \
+    int offset = GGML_F32_ARR >> 1;               \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    offset >>= 1;                                 \
+    for (int i = 0; i < offset; ++i) {            \
+        x[i] = _mm512_add_ps(x[i], x[offset+i]);  \
+    }                                             \
+    res = _mm512_reduce_add_ps(x[0]);             \
+} while (0)
+
+#define GGML_F16_VEC                GGML_F32Cx16
+#define GGML_F16_VEC_ZERO           GGML_F32Cx16_ZERO
+#define GGML_F16_VEC_SET1           GGML_F32Cx16_SET1
+#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx16_LOAD(p)
+#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
+#define GGML_F16_VEC_FMA            GGML_F32Cx16_FMA
+#define GGML_F16_VEC_ADD            GGML_F32Cx16_ADD
+#define GGML_F16_VEC_MUL            GGML_F32Cx16_MUL
+#define GGML_F16_VEC_REDUCE         GGML_F32Cx16_REDUCE
+
 #elif defined(__AVX__)

 #define GGML_SIMD
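The GGML_F32x16_* macros added above are thin wrappers over standard AVX-512 idioms. For reference, the same load / FMA / horizontal-reduce pattern written directly with intrinsics looks like the sketch below. This is a standalone illustration, not code from this diff: it assumes AVX512F, an element count that is a multiple of 16, and uses a single accumulator where ggml keeps GGML_F32_ARR of them before reducing.

#include <immintrin.h>

// dot(x, y) over n floats, with n % 16 == 0
static float dot_f32_avx512(const float * x, const float * y, int n) {
    __m512 acc = _mm512_setzero_ps();              // GGML_F32x16_ZERO
    for (int i = 0; i < n; i += 16) {              // GGML_F32_EPR elements per step
        __m512 vx = _mm512_loadu_ps(x + i);        // GGML_F32x16_LOAD
        __m512 vy = _mm512_loadu_ps(y + i);
        acc = _mm512_fmadd_ps(vx, vy, acc);        // GGML_F32x16_FMA(acc, vx, vy)
    }
    return _mm512_reduce_add_ps(acc);              // the final step of GGML_F32x16_REDUCE
}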
@@ -2532,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
     }

-#if defined(GGML_USE_CUBLAS)
-        ggml_init_cublas();
-#elif defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
 #elif defined(GGML_USE_VULKAN)
         ggml_vk_init_cpu_assist();
@@ -10997,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_CLBLAST)

 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)

@@ -11197,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32(
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows

-    // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod
     // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)

     if (params->type == GGML_TASK_TYPE_INIT) {
@@ -12418,6 +12520,8 @@ static void ggml_compute_forward_alibi(
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
         case GGML_TYPE_COUNT:
             {
                 GGML_ASSERT(false);

@@ -12504,6 +12608,8 @@ static void ggml_compute_forward_clamp(
         case GGML_TYPE_I8:
         case GGML_TYPE_I16:
         case GGML_TYPE_I32:
+        case GGML_TYPE_I64:
+        case GGML_TYPE_F64:
         case GGML_TYPE_COUNT:
             {
                 GGML_ASSERT(false);
@@ -15939,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }

-#ifdef GGML_USE_CUBLAS
-    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#elif defined(GGML_USE_VULKAN)
+#if defined(GGML_USE_VULKAN)
     const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
 #ifdef GGML_VULKAN_CHECK_RESULTS
     if (skip_cpu) {

@@ -15958,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_VULKAN

 #ifdef GGML_USE_SYCL
     bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
ggml.h (2 changed lines)

@@ -366,6 +366,8 @@ extern "C" {
         GGML_TYPE_I8  = 24,
         GGML_TYPE_I16 = 25,
         GGML_TYPE_I32 = 26,
+        GGML_TYPE_I64 = 27,
+        GGML_TYPE_F64 = 28,
         GGML_TYPE_COUNT,
     };
gguf-py/gguf/constants.py

@@ -42,6 +42,7 @@ class Keys:
         EXPERT_COUNT      = "{arch}.expert_count"
         EXPERT_USED_COUNT = "{arch}.expert_used_count"
         POOLING_TYPE      = "{arch}.pooling_type"
+        LOGIT_SCALE       = "{arch}.logit_scale"

     class Attention:
         HEAD_COUNT = "{arch}.attention.head_count"

@@ -121,6 +122,7 @@ class MODEL_ARCH(IntEnum):
     GEMMA      = auto()
     STARCODER2 = auto()
     MAMBA      = auto()
+    COMMAND_R  = auto()


 class MODEL_TENSOR(IntEnum):

@@ -187,6 +189,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.GEMMA:      "gemma",
     MODEL_ARCH.STARCODER2: "starcoder2",
     MODEL_ARCH.MAMBA:      "mamba",
+    MODEL_ARCH.COMMAND_R:  "command-r",
 }

 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {

@@ -579,6 +582,18 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.SSM_D,
         MODEL_TENSOR.SSM_OUT,
     ],
+    MODEL_ARCH.COMMAND_R: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }

@@ -665,6 +680,8 @@ class GGMLQuantizationType(IntEnum):
     I8  = 24
     I16 = 25
     I32 = 26
+    I64 = 27
+    F64 = 28


 class GGUFEndian(IntEnum):

@@ -734,6 +751,8 @@ GGML_QUANT_SIZES = {
     GGMLQuantizationType.I8:  (1, 1),
     GGMLQuantizationType.I16: (1, 2),
     GGMLQuantizationType.I32: (1, 4),
+    GGMLQuantizationType.I64: (1, 8),
+    GGMLQuantizationType.F64: (1, 8),
 }
gguf-py/gguf/gguf_reader.py

@@ -242,12 +242,15 @@ class GGUFReader:
         n_bytes = n_elems * type_size // block_size
         data_offs = int(start_offs + offset_tensor[0])
         item_type: npt.DTypeLike
-        if ggml_type == GGMLQuantizationType.F32:
-            item_count = n_elems
-            item_type = np.float32
-        elif ggml_type == GGMLQuantizationType.F16:
+        if ggml_type == GGMLQuantizationType.F16:
             item_count = n_elems
             item_type = np.float16
+        elif ggml_type == GGMLQuantizationType.F32:
+            item_count = n_elems
+            item_type = np.float32
+        elif ggml_type == GGMLQuantizationType.F64:
+            item_count = n_elems
+            item_type = np.float64
         elif ggml_type == GGMLQuantizationType.I8:
             item_count = n_elems
             item_type = np.int8

@@ -257,6 +260,9 @@ class GGUFReader:
         elif ggml_type == GGMLQuantizationType.I32:
             item_count = n_elems
             item_type = np.int32
+        elif ggml_type == GGMLQuantizationType.I64:
+            item_count = n_elems
+            item_type = np.int64
         else:
             item_count = n_bytes
             item_type = np.uint8
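As a quick check of the new entries: with the (1, 8) rows added to GGML_QUANT_SIZES above, an I64 or F64 tensor of, say, 1024 elements (an illustrative figure) gives n_bytes = n_elems * type_size // block_size = 1024 * 8 // 1 = 8192 in the reader, and the tensor data is then viewed as np.int64 or np.float64 respectively.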
gguf-py/gguf/gguf_writer.py

@@ -204,18 +204,22 @@ class GGUFWriter:
         for i in range(n_dims):
             self.ti_data += self._pack("Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
-            if tensor_dtype == np.float32:
-                dtype = GGMLQuantizationType.F32
-            elif tensor_dtype == np.float16:
+            if tensor_dtype == np.float16:
                 dtype = GGMLQuantizationType.F16
+            elif tensor_dtype == np.float32:
+                dtype = GGMLQuantizationType.F32
+            elif tensor_dtype == np.float64:
+                dtype = GGMLQuantizationType.F64
             elif tensor_dtype == np.int8:
                 dtype = GGMLQuantizationType.I8
             elif tensor_dtype == np.int16:
                 dtype = GGMLQuantizationType.I16
             elif tensor_dtype == np.int32:
                 dtype = GGMLQuantizationType.I32
+            elif tensor_dtype == np.int64:
+                dtype = GGMLQuantizationType.I64
             else:
-                raise ValueError("Only F32, F16, I8, I16, I32 tensors are supported for now")
+                raise ValueError("Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now")
         else:
             dtype = raw_dtype
         self.ti_data += self._pack("I", dtype)

@@ -357,6 +361,9 @@ class GGUFWriter:
     def add_clamp_kqv(self, value: float) -> None:
         self.add_float32(Keys.Attention.CLAMP_KQV.format(arch=self.arch), value)

+    def add_logit_scale(self, value: float) -> None:
+        self.add_float32(Keys.LLM.LOGIT_SCALE.format(arch=self.arch), value)
+
     def add_expert_count(self, count: int) -> None:
         self.add_uint32(Keys.LLM.EXPERT_COUNT.format(arch=self.arch), count)
llama.cpp (402 changed lines)

@@ -214,6 +214,7 @@ enum llm_arch {
     LLM_ARCH_GEMMA,
     LLM_ARCH_STARCODER2,
     LLM_ARCH_MAMBA,
+    LLM_ARCH_COMMAND_R,
     LLM_ARCH_UNKNOWN,
 };

@@ -243,6 +244,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_GEMMA, "gemma" },
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
+    { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };

@@ -268,6 +270,7 @@ enum llm_kv {
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
     LLM_KV_POOLING_TYPE,
+    LLM_KV_LOGIT_SCALE,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,

@@ -332,6 +335,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_COUNT, "%s.expert_count" },
     { LLM_KV_EXPERT_USED_COUNT, "%s.expert_used_count" },
     { LLM_KV_POOLING_TYPE , "%s.pooling_type" },
+    { LLM_KV_LOGIT_SCALE, "%s.logit_scale" },

     { LLM_KV_ATTENTION_HEAD_COUNT, "%s.attention.head_count" },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },

@@ -536,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
     {
         { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+        { LLM_TENSOR_OUTPUT, "output"},
         { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
         { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
         { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
@@ -838,6 +843,21 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
         },
     },
+    {
+        LLM_ARCH_COMMAND_R,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {

@@ -1608,6 +1628,7 @@ enum e_model {
     MODEL_20B,
     MODEL_30B,
     MODEL_34B,
+    MODEL_35B,
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,

@@ -1654,6 +1675,7 @@ struct llama_hparams {

     float f_clamp_kqv = 0.0f;
     float f_max_alibi_bias = 0.0f;
+    float f_logit_scale = 0.0f;

     bool causal_attn = true;
     bool need_kq_pos = false;
@@ -1885,6 +1907,31 @@ struct llama_kv_cache {
     }
 };

+struct llama_control_vector {
+    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<struct ggml_context *> ctxs;
+    std::vector<ggml_backend_buffer_t> bufs;
+
+    int32_t layer_start = -1;
+    int32_t layer_end   = -1;
+
+    ggml_tensor * tensor_for(int il) const {
+        if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
+            return nullptr;
+        }
+        return tensors[il];
+    }
+
+    ~llama_control_vector() {
+        for (struct ggml_context * ctx : ctxs) {
+            ggml_free(ctx);
+        }
+        for (ggml_backend_buffer_t buf : bufs) {
+            ggml_backend_buffer_free(buf);
+        }
+    }
+};
+
 struct llama_vocab {
     using id = int32_t;
     using token = std::string;

@@ -2010,6 +2057,11 @@ struct llama_model {
             ggml_free(ctx);
         }
         for (ggml_backend_buffer_t buf : bufs) {
+#ifdef GGML_USE_CUBLAS
+            if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) {
+                ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf));
+            }
+#endif
             ggml_backend_buffer_free(buf);
         }
     }

@@ -2104,6 +2156,8 @@ struct llama_context {
     struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch]

+    // control vectors
+    struct llama_control_vector cvec;
 };

 //
@@ -3245,6 +3299,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_20B: return "20B";
         case MODEL_30B: return "30B";
         case MODEL_34B: return "34B";
+        case MODEL_35B: return "35B";
         case MODEL_40B: return "40B";
         case MODEL_65B: return "65B";
         case MODEL_70B: return "70B";

@@ -3642,6 +3697,15 @@ static void llm_load_hparams(
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_COMMAND_R:
+            {
+                ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                switch (hparams.n_layer) {
+                    case 40: model.type = e_model::MODEL_35B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
         default: (void)0;
     }

@@ -3963,6 +4027,7 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps);
     LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv);
     LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias);
+    LLAMA_LOG_INFO("%s: f_logit_scale = %.1e\n", __func__, hparams.f_logit_scale);
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
@@ -4281,9 +4346,9 @@ static bool llm_load_tensors(
                 {
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
-                    if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) {
-                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
-                    } else {
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    if (!model.output) {
                         model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
                         ml.size_data += ggml_nbytes(model.output);

@@ -4488,11 +4553,13 @@ static bool llm_load_tensors(
                     model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
                     model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false);

-                    // same as tok_embd, duplicated to allow offloading
-                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false);
+                    if (!model.output) {
+                        model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU
                         ml.n_created--; // artificial tensor
                         ml.size_data += ggml_nbytes(model.output);
                     }
+                }

                 for (int i = 0; i < n_layer; ++i) {
                     ggml_context * ctx_layer = ctx_for_layer(i);
@@ -4964,6 +5031,37 @@ static bool llm_load_tensors(
                     layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd});
                 }
             } break;
+        case LLM_ARCH_COMMAND_R:
+            {
+                model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
+                // output
+                {
+                    model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+                    // init output from the input tok embed
+                    model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+                    ml.n_created--; // artificial tensor
+                    ml.size_data += ggml_nbytes(model.output);
+                }
+
+                for (int i = 0; i < n_layer; ++i) {
+                    ggml_context * ctx_layer = ctx_for_layer(i);
+                    ggml_context * ctx_split = ctx_for_layer_split(i);
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+
+                    layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+                    layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+                    layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+
+                    layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+                    layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+                    layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+                }
+            } break;
         default:
             throw std::runtime_error("unknown architecture");
     }
@@ -4988,8 +5086,13 @@ static bool llm_load_tensors(
             size_t first, last;
             ml.get_mapping_range(&first, &last, ctx);
             buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
-#ifdef GGML_USE_MPI
-            buf = ggml_backend_mpi_wrap_buffer(buf);
+#ifdef GGML_USE_CUBLAS
+            if (n_layer >= n_gpu_layers) {
+                ggml_backend_cuda_register_host_buffer(
+                    ggml_backend_buffer_get_base(buf),
+                    ggml_backend_buffer_get_size(buf));
+            }
 #endif
         }
 #ifdef GGML_USE_METAL

@@ -5013,6 +5116,10 @@ static bool llm_load_tensors(
         if (buf == nullptr) {
             throw std::runtime_error("failed to allocate buffer");
         }
+
+#ifdef GGML_USE_MPI
+        buf = ggml_backend_mpi_wrap_buffer(buf);
+#endif
         // indicate that this buffer contains weights
         // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
         ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

@@ -5114,6 +5221,16 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
     }
 #endif

+#ifdef GGML_USE_SYCL
+    if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
+        ggml_backend_sycl_set_single_device_mode(params.main_gpu);
+        //SYCL use device index (0, 1, 2) directly, uer input device id, then convert to device index.
+        params.main_gpu = ggml_backend_sycl_get_device_index(params.main_gpu);
+    } else {
+        ggml_backend_sycl_set_mul_device_mode();
+    }
+#endif
+
     if (!llm_load_tensors(
         ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.node_layer_weights, params.use_mlock,
         params.progress_callback, params.progress_callback_user_data
@@ -5909,6 +6026,12 @@ struct llm_build_context {
         }

         cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "ffn_out", il);
+
+        ggml_tensor * layer_dir = lctx.cvec.tensor_for(il);
+        if (layer_dir != nullptr) {
+            cur = ggml_add(ctx0, cur, layer_dir);
+        }
         cb(cur, "l_out", il);

         // input for next layer

@@ -5944,7 +6067,7 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);

         // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
+        struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;

         // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

@@ -5995,7 +6118,6 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);
                 cb(Kcur, "Kcur", il);

-
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, NULL,
                         Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);

@@ -8183,7 +8305,6 @@ struct llm_build_context {
                 cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                         model.layers[il].wo, model.layers[il].bo,
                         Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
-                cb(cur, "kqv_out", il);
             }

             struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
@@ -8363,6 +8484,121 @@ struct llm_build_context {

         return gf;
     }
+
+    struct ggml_cgraph * build_command_r() {
+
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        const int64_t n_embd_head = hparams.n_embd_head_v;
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        const float f_logit_scale = hparams.f_logit_scale;
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = build_inp_pos();
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
+
+        for (int il = 0; il < n_layer; ++il) {
+
+            // norm
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM, cb, il);
+            cb(cur, "attn_norm", il);
+            struct ggml_tensor * ffn_inp = cur;
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
+            }
+
+            struct ggml_tensor * attn_out = cur;
+
+            // feed-forward network
+            {
+                cur = llm_build_ffn(ctx0, ffn_inp,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            // add together residual + FFN + self-attention
+            cur = ggml_add(ctx0, cur, inpL);
+            cur = ggml_add(ctx0, cur, attn_out);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+
+        if (f_logit_scale) {
+            cur = ggml_scale(ctx0, cur, f_logit_scale);
+        }
+
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+
+    }
 };

 static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
|
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
|
||||||
// to fix this, we assign the norm layer manually to the backend of its layer
|
// FIXME: fix in ggml_backend_sched
|
||||||
|
const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer;
|
||||||
|
if (batch.n_tokens < 32 || full_offload) {
|
||||||
if (il != -1 && strcmp(name, "norm") == 0) {
|
if (il != -1 && strcmp(name, "norm") == 0) {
|
||||||
for (auto * backend : lctx.backends) {
|
for (auto * backend : lctx.backends) {
|
||||||
if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
|
if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) {
|
||||||
|
@ -8447,6 +8685,7 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_cgraph * result = NULL;
|
struct ggml_cgraph * result = NULL;
|
||||||
|
@ -8545,6 +8784,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||||
{
|
{
|
||||||
result = llm.build_mamba();
|
result = llm.build_mamba();
|
||||||
} break;
|
} break;
|
||||||
|
case LLM_ARCH_COMMAND_R:
|
||||||
|
{
|
||||||
|
result = llm.build_command_r();
|
||||||
|
} break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
}
|
}
|
||||||
|
@ -12958,9 +13201,8 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ctx->backends.push_back(ctx->backend_metal);
|
ctx->backends.push_back(ctx->backend_metal);
|
||||||
}
|
}
|
||||||
#elif defined(GGML_USE_CUBLAS)
|
#elif defined(GGML_USE_CUBLAS)
|
||||||
if (model->n_gpu_layers > 0) {
|
|
||||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||||
|
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
||||||
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
||||||
|
@ -12980,7 +13222,6 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ctx->backends.push_back(backend);
|
ctx->backends.push_back(backend);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#elif defined(GGML_USE_VULKAN)
|
#elif defined(GGML_USE_VULKAN)
|
||||||
if (model->n_gpu_layers > 0) {
|
if (model->n_gpu_layers > 0) {
|
||||||
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
|
||||||
|
@ -12997,23 +13238,22 @@ struct llama_context * llama_new_context_with_model(
|
||||||
if (model->n_gpu_layers > 0) {
|
if (model->n_gpu_layers > 0) {
|
||||||
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
||||||
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
||||||
int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu);
|
ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
|
||||||
ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index);
|
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index);
|
int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu);
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
ctx->backends.push_back(backend);
|
ctx->backends.push_back(backend);
|
||||||
} else {
|
} else {
|
||||||
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
||||||
int id_list[GGML_SYCL_MAX_DEVICES];
|
|
||||||
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
|
||||||
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) {
|
||||||
int device_id = id_list[i];
|
|
||||||
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
ggml_backend_t backend = ggml_backend_sycl_init(i);
|
||||||
if (backend == nullptr) {
|
if (backend == nullptr) {
|
||||||
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i);
|
int id_list[GGML_SYCL_MAX_DEVICES];
|
||||||
|
ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES);
|
||||||
|
LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
@ -13156,14 +13396,17 @@ struct llama_context * llama_new_context_with_model(
|
||||||
ggml_backend_t backend = ctx->backends[i];
|
ggml_backend_t backend = ctx->backends[i];
|
||||||
ggml_backend_buffer_type_t buft = backend_buft[i];
|
ggml_backend_buffer_type_t buft = backend_buft[i];
|
||||||
size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
|
size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend);
|
||||||
|
if (size > 1) {
|
||||||
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
||||||
ggml_backend_buft_name(buft),
|
ggml_backend_buft_name(buft),
|
||||||
size / 1024.0 / 1024.0);
|
size / 1024.0 / 1024.0);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// note: the number of splits during measure is higher than during inference due to the kv shift
|
// note: the number of splits during measure is higher than during inference due to the kv shift
|
||||||
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
||||||
LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits);
|
LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes);
|
||||||
|
LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -13225,6 +13468,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||||
case LLM_ARCH_ORION:
|
case LLM_ARCH_ORION:
|
||||||
case LLM_ARCH_INTERNLM2:
|
case LLM_ARCH_INTERNLM2:
|
||||||
case LLM_ARCH_MINICPM:
|
case LLM_ARCH_MINICPM:
|
||||||
|
case LLM_ARCH_COMMAND_R:
|
||||||
return LLAMA_ROPE_TYPE_NORM;
|
return LLAMA_ROPE_TYPE_NORM;
|
||||||
|
|
||||||
// the pairs of head values are offset by n_rot/2
|
// the pairs of head values are offset by n_rot/2
|
||||||
|
@ -13261,6 +13505,10 @@ int32_t llama_n_embd(const struct llama_model * model) {
|
||||||
return model->hparams.n_embd;
|
return model->hparams.n_embd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int32_t llama_n_layer(const struct llama_model * model) {
|
||||||
|
return model->hparams.n_layer;
|
||||||
|
}
|
||||||
|
|
||||||
float llama_rope_freq_scale_train(const struct llama_model * model) {
|
float llama_rope_freq_scale_train(const struct llama_model * model) {
|
||||||
return model->hparams.rope_freq_scale_train;
|
return model->hparams.rope_freq_scale_train;
|
||||||
}
|
}
|
||||||
|
@@ -13360,6 +13608,96 @@ int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const
     }
 }

+static bool llama_control_vector_init(struct llama_control_vector & cvec, const llama_model & model) {
+    GGML_ASSERT(cvec.tensors.empty());
+    GGML_ASSERT(cvec.ctxs.empty());
+    GGML_ASSERT(cvec.bufs.empty());
+
+    // count layer buffer types
+    std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+    for (int64_t i = 0; i < model.hparams.n_layer; i++) {
+        buft_layer_count[model.buft_layer[i].buft]++;
+    }
+
+    // allocate contexts
+    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+    for (auto & it : buft_layer_count) {
+        int n_layers = it.second;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ n_layers * ggml_tensor_overhead(),
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+        };
+        ggml_context * ctx = ggml_init(params);
+        if (!ctx) {
+            LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
+            return 1;
+        }
+        ctx_map[it.first] = ctx;
+    }
+
+    // make tensors
+    cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        struct ggml_context * ctx = ctx_map.at(model.buft_layer[il].buft);
+        ggml_tensor * tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, model.hparams.n_embd);
+        cvec.tensors.push_back(tensor);
+    }
+
+    // allocate tensors / buffers and zero
+    for (auto it : ctx_map) {
+        ggml_backend_buffer_type_t buft = it.first;
+        ggml_context * ctx = it.second;
+        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+        if (!buf) {
+            LLAMA_LOG_ERROR("%s: failed to allocate buffer for control vector\n", __func__);
+            return false;
+        }
+        ggml_backend_buffer_clear(buf, 0);
+        cvec.ctxs.push_back(ctx);
+        cvec.bufs.push_back(buf);
+    }
+
+    return true;
+}
+
+int32_t llama_control_vector_apply(struct llama_context * lctx, const float * data, size_t len, int32_t n_embd, int32_t il_start, int32_t il_end) {
+    const llama_model & model = lctx->model;
+    llama_control_vector & cvec = lctx->cvec;
+
+    if (data == nullptr) {
+        // disable the current control vector (but leave allocated for later)
+        cvec.layer_start = -1;
+        cvec.layer_end   = -1;
+        return 0;
+    }
+
+    if (n_embd != (int) model.hparams.n_embd) {
+        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
+        return 1;
+    }
+
+    if (cvec.tensors.empty()) {
+        if (!llama_control_vector_init(cvec, model)) {
+            return 1;
+        }
+    }
+
+    cvec.layer_start = il_start;
+    cvec.layer_end   = il_end;
+
+    for (size_t il = 1; il < model.hparams.n_layer; il++) {
+        assert(cvec.tensors[il] != nullptr);
+
+        const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present
+        if (off + n_embd <= len) {
+            ggml_backend_tensor_set(cvec.tensors[il], data + off, 0, n_embd * ggml_element_size(cvec.tensors[il]));
+        }
+    }
+
+    return 0;
+}
+
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max) {
     struct llama_kv_cache_view result = {
         /*.n_cells = */ 0,
|
||||||
if (add_ass) {
|
if (add_ass) {
|
||||||
ss << "<start_of_turn>model\n";
|
ss << "<start_of_turn>model\n";
|
||||||
}
|
}
|
||||||
|
} else if (tmpl == "orion" || tmpl.find("'\\n\\nAssistant: ' + eos_token") != std::string::npos) {
|
||||||
|
// OrionStarAI/Orion-14B-Chat
|
||||||
|
std::string system_prompt = "";
|
||||||
|
for (auto message : chat) {
|
||||||
|
std::string role(message->role);
|
||||||
|
if (role == "system") {
|
||||||
|
// there is no system message support, we will merge it with user prompt
|
||||||
|
system_prompt = message->content;
|
||||||
|
continue;
|
||||||
|
} else if (role == "user") {
|
||||||
|
ss << "Human: ";
|
||||||
|
if (!system_prompt.empty()) {
|
||||||
|
ss << system_prompt << "\n\n";
|
||||||
|
system_prompt = "";
|
||||||
|
}
|
||||||
|
ss << message->content << "\n\nAssistant: </s>";
|
||||||
|
} else {
|
||||||
|
ss << message->content << "</s>";
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// template not supported
|
// template not supported
|
||||||
return -1;
|
return -1;
|
||||||
|
|
llama.h (15 changed lines)

@@ -395,6 +395,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);

     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

@@ -447,6 +448,20 @@ extern "C" {
             const char * path_base_model,
             int32_t n_threads);

+    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
+    // the currently loaded vector.
+    // n_embd should be the size of a single layer's control, and data should point
+    // to an n_embd x n_layers buffer starting from layer 1.
+    // il_start and il_end are the layer range the vector should apply to (both inclusive)
+    // See llama_control_vector_load in common to load a control vector.
+    LLAMA_API int32_t llama_control_vector_apply(
+            struct llama_context * lctx,
+            const float * data,
+            size_t len,
+            int32_t n_embd,
+            int32_t il_start,
+            int32_t il_end);
+
     //
     // KV cache
     //
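The comment block above describes the expected buffer layout for llama_control_vector_apply; a minimal C usage sketch follows. It is illustrative only: apply_zero_control_vector is a hypothetical helper, the all-zero buffer is a stand-in for a real control vector (normally produced by llama_control_vector_load in common), and lctx/model are assumed to be already initialized.

#include <stdlib.h>
#include "llama.h"

static int32_t apply_zero_control_vector(struct llama_context * lctx, const struct llama_model * model) {
    const int32_t n_embd  = llama_n_embd(model);
    const int32_t n_layer = llama_n_layer(model);

    // one n_embd-sized direction per layer, starting from layer 1
    const size_t len = (size_t) n_embd * (size_t) (n_layer - 1);
    float * data = (float *) calloc(len, sizeof(float)); // all zeros: a no-op direction
    if (data == NULL) {
        return 1;
    }

    // apply to layers 1 .. n_layer-1 (il_start and il_end are both inclusive)
    const int32_t res = llama_control_vector_apply(lctx, data, len, n_embd, 1, n_layer - 1);

    free(data); // the data is copied into backend tensors, so the buffer can be freed
    return res;
}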
tests/test-chat-template.cpp

@@ -31,6 +31,8 @@ int main(void) {
         "{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
         // google/gemma-7b-it
         "{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
+        // OrionStarAI/Orion-14B-Chat
+        "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
     };
     std::vector<std::string> expected_output = {
         // teknium/OpenHermes-2.5-Mistral-7B

@@ -45,6 +47,8 @@ int main(void) {
         "system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
         // google/gemma-7b-it
         "<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
+        // OrionStarAI/Orion-14B-Chat
+        "Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>",
     };
     std::vector<char> formatted_chat(1024);
     int32_t res;