Merge branch 'master' into gg/llama-kv-cache

ggml-ci
Georgi Gerganov 2025-01-29 15:01:26 +02:00
commit c30e34cdba
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
20 changed files with 331 additions and 136 deletions


@@ -13,9 +13,13 @@ elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
     exec ./llama-cli "$@"
+elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
+    exec ./llama-bench "$@"
+elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
+    exec ./llama-perplexity "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
-    for i in `ls $1/$2/ggml-model-f16.bin*`; do
+    for i in $(ls $1/$2/ggml-model-f16.bin*); do
         if [ -f "${i/f16/q4_0}" ]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
@@ -30,6 +34,10 @@ else
     echo "Available commands: "
     echo " --run (-r): Run a model previously converted into ggml"
     echo "             ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo " --bench (-b): Benchmark the performance of the inference for various parameters."
+    echo "           ex: -m model.gguf"
+    echo " --perplexity (-p): Measure the perplexity of a model over a given text."
+    echo "           ex: -m model.gguf -f file.txt"
     echo " --convert (-c): Convert a llama model into ggml"
     echo "           ex: --outtype f16 \"/models/7B/\" "
     echo " --quantize (-q): Optimize with quantization process ggml"


@ -1,4 +1,4 @@
ARG UBUNTU_VERSION=22.04 ARG UBUNTU_VERSION=24.04
FROM ubuntu:$UBUNTU_VERSION AS build FROM ubuntu:$UBUNTU_VERSION AS build
@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget
# Install Vulkan SDK and cURL # Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
apt update -y && \ apt update -y && \
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
@ -55,8 +55,9 @@ RUN apt-get update \
git \ git \
python3 \ python3 \
python3-pip \ python3-pip \
&& pip install --upgrade pip setuptools wheel \ python3-wheel \
&& pip install -r requirements.txt \ && pip install --break-system-packages --upgrade setuptools \
&& pip install --break-system-packages -r requirements.txt \
&& apt autoremove -y \ && apt autoremove -y \
&& apt clean -y \ && apt clean -y \
&& rm -rf /tmp/* /var/tmp/* \ && rm -rf /tmp/* /var/tmp/* \


@@ -28,7 +28,7 @@ jobs:
   push_to_registry:
     name: Push Docker image to Docker Hub
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     env:
       COMMIT_SHA: ${{ github.sha }}
     strategy:
@@ -36,8 +36,7 @@ jobs:
       matrix:
         config:
           # Multi-stage build
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/arm64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}


@@ -9,7 +9,7 @@ set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
 set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
 set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

-find_package(ggml REQUIRED)
+find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

 find_library(llama_LIBRARY llama
     REQUIRED


@@ -877,7 +877,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params) {
             params.warmup = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--spm-infill"},
         string_format(


@@ -181,6 +181,10 @@ class Opt {
             }
         }

+        if (model_.empty()){
+            return 1;
+        }
+
         return 0;
     }

@@ -319,6 +323,10 @@ class HttpClient {
   public:
     int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
              const bool progress, std::string * response_str = nullptr) {
+        if (std::filesystem::exists(output_file)) {
+            return 0;
+        }
+
         std::string output_file_partial;
         curl = curl_easy_init();
         if (!curl) {
@@ -346,7 +354,11 @@ class HttpClient {
         data.file_size = set_resume_point(output_file_partial);
         set_progress_options(progress, data);
         set_headers(headers);
-        perform(url);
+        CURLcode res = perform(url);
+        if (res != CURLE_OK){
+            printe("Fetching resource '%s' failed: %s\n", url.c_str(), curl_easy_strerror(res));
+            return 1;
+        }
         if (!output_file.empty()) {
             std::filesystem::rename(output_file_partial, output_file);
         }
@@ -411,16 +423,12 @@ class HttpClient {
         }
     }

-    void perform(const std::string & url) {
-        CURLcode res;
+    CURLcode perform(const std::string & url) {
         curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
         curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
         curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
         curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
-        res = curl_easy_perform(curl);
-        if (res != CURLE_OK) {
-            printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
-        }
+        return curl_easy_perform(curl);
     }

     static std::string human_readable_time(double seconds) {
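Note: with this change perform() no longer logs and swallows the error; it hands the raw CURLcode back to the caller, which can then report which resource failed. The standalone sketch below illustrates the same pattern with plain libcurl; the fetch() helper and the URL are illustrative stand-ins, not code from this file.

#include <curl/curl.h>
#include <cstdio>

// Illustrative stand-in for a perform()-style helper: configure the handle
// and return the raw status instead of printing it here.
static CURLcode fetch(CURL * curl, const char * url) {
    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
    return curl_easy_perform(curl);
}

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) {
        return 1;
    }
    // the caller owns the error message, so it can name the resource that failed
    const CURLcode res = fetch(curl, "https://example.com");
    if (res != CURLE_OK) {
        fprintf(stderr, "fetching 'https://example.com' failed: %s\n", curl_easy_strerror(res));
    }
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return res == CURLE_OK ? 0 : 1;
}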
@@ -558,13 +566,14 @@ class LlamaData {
         }

         sampler = initialize_sampler(opt);
+
         return 0;
     }

   private:
 #ifdef LLAMA_USE_CURL
-    int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
-                 const bool progress, std::string * response_str = nullptr) {
+    int download(const std::string & url, const std::string & output_file, const bool progress,
+                 const std::vector<std::string> & headers = {}, std::string * response_str = nullptr) {
         HttpClient http;
         if (http.init(url, headers, output_file, progress, response_str)) {
             return 1;
@@ -573,47 +582,84 @@ class LlamaData {
         return 0;
     }
 #else
-    int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
-                 std::string * = nullptr) {
+    int download(const std::string &, const std::string &, const bool, const std::vector<std::string> & = {},
+                 std::string * = nullptr) {
         printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
         return 1;
     }
 #endif

-    int huggingface_dl(const std::string & model, const std::vector<std::string> headers, const std::string & bn) {
-        // Find the second occurrence of '/' after protocol string
-        size_t pos = model.find('/');
-        pos = model.find('/', pos + 1);
-        if (pos == std::string::npos) {
-            return 1;
-        }
-
-        const std::string hfr = model.substr(0, pos);
-        const std::string hff = model.substr(pos + 1);
-        const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
-        return download(url, headers, bn, true);
-    }
-
-    int ollama_dl(std::string & model, const std::vector<std::string> headers, const std::string & bn) {
-        if (model.find('/') == std::string::npos) {
-            model = "library/" + model;
-        }
-
+    // Helper function to handle model tag extraction and URL construction
+    std::pair<std::string, std::string> extract_model_and_tag(std::string & model, const std::string & base_url) {
         std::string model_tag = "latest";
-        size_t colon_pos = model.find(':');
+        const size_t colon_pos = model.find(':');
         if (colon_pos != std::string::npos) {
             model_tag = model.substr(colon_pos + 1);
             model = model.substr(0, colon_pos);
         }

-        std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
+        std::string url = base_url + model + "/manifests/" + model_tag;
+
+        return { model, url };
+    }
+
+    // Helper function to download and parse the manifest
+    int download_and_parse_manifest(const std::string & url, const std::vector<std::string> & headers,
+                                    nlohmann::json & manifest) {
         std::string manifest_str;
-        const int ret = download(manifest_url, headers, "", false, &manifest_str);
+        int ret = download(url, "", false, headers, &manifest_str);
+        if (ret) {
+            return ret;
+        }
+
+        manifest = nlohmann::json::parse(manifest_str);
+
+        return 0;
+    }
+
+    int huggingface_dl(std::string & model, const std::string & bn) {
+        // Find the second occurrence of '/' after protocol string
+        size_t pos = model.find('/');
+        pos = model.find('/', pos + 1);
+        std::string hfr, hff;
+        std::vector<std::string> headers = { "User-Agent: llama-cpp", "Accept: application/json" };
+        std::string url;
+
+        if (pos == std::string::npos) {
+            auto [model_name, manifest_url] = extract_model_and_tag(model, "https://huggingface.co/v2/");
+            hfr = model_name;
+
+            nlohmann::json manifest;
+            int ret = download_and_parse_manifest(manifest_url, headers, manifest);
+            if (ret) {
+                return ret;
+            }
+
+            hff = manifest["ggufFile"]["rfilename"];
+        } else {
+            hfr = model.substr(0, pos);
+            hff = model.substr(pos + 1);
+        }
+
+        url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
+
+        return download(url, bn, true, headers);
+    }
+
+    int ollama_dl(std::string & model, const std::string & bn) {
+        const std::vector<std::string> headers = { "Accept: application/vnd.docker.distribution.manifest.v2+json" };
+        if (model.find('/') == std::string::npos) {
+            model = "library/" + model;
+        }
+
+        auto [model_name, manifest_url] = extract_model_and_tag(model, "https://registry.ollama.ai/v2/");
+        nlohmann::json manifest;
+        int ret = download_and_parse_manifest(manifest_url, {}, manifest);
         if (ret) {
             return ret;
         }

-        nlohmann::json manifest = nlohmann::json::parse(manifest_str);
         std::string layer;
         for (const auto & l : manifest["layers"]) {
             if (l["mediaType"] == "application/vnd.ollama.image.model") {
@@ -622,8 +668,34 @@ class LlamaData {
             }
         }

-        std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer;
-        return download(blob_url, headers, bn, true);
+        std::string blob_url = "https://registry.ollama.ai/v2/" + model_name + "/blobs/" + layer;
+
+        return download(blob_url, bn, true, headers);
+    }
+
+    int github_dl(const std::string & model, const std::string & bn) {
+        std::string repository = model;
+        std::string branch = "main";
+        const size_t at_pos = model.find('@');
+        if (at_pos != std::string::npos) {
+            repository = model.substr(0, at_pos);
+            branch = model.substr(at_pos + 1);
+        }
+
+        const std::vector<std::string> repo_parts = string_split(repository, "/");
+        if (repo_parts.size() < 3) {
+            printe("Invalid GitHub repository format\n");
+            return 1;
+        }
+
+        const std::string & org = repo_parts[0];
+        const std::string & project = repo_parts[1];
+        std::string url = "https://raw.githubusercontent.com/" + org + "/" + project + "/" + branch;
+        for (size_t i = 2; i < repo_parts.size(); ++i) {
+            url += "/" + repo_parts[i];
+        }
+
+        return download(url, bn, true);
     }

     std::string basename(const std::string & path) {
@@ -654,21 +726,22 @@
         }

         const std::string bn = basename(model_);
-        const std::vector<std::string> headers = { "--header",
-                                                   "Accept: application/vnd.docker.distribution.manifest.v2+json" };
-        if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
-            rm_until_substring(model_, "://");
-            ret = huggingface_dl(model_, headers, bn);
-        } else if (string_starts_with(model_, "hf.co/")) {
+        if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://") ||
+            string_starts_with(model_, "hf.co/")) {
             rm_until_substring(model_, "hf.co/");
-            ret = huggingface_dl(model_, headers, bn);
-        } else if (string_starts_with(model_, "ollama://")) {
             rm_until_substring(model_, "://");
-            ret = ollama_dl(model_, headers, bn);
-        } else if (string_starts_with(model_, "https://")) {
-            ret = download(model_, headers, bn, true);
-        } else {
-            ret = ollama_dl(model_, headers, bn);
+            ret = huggingface_dl(model_, bn);
+        } else if ((string_starts_with(model_, "https://") || string_starts_with(model_, "http://")) &&
+                   !string_starts_with(model_, "https://ollama.com/library/")) {
+            ret = download(model_, bn, true);
+        } else if (string_starts_with(model_, "github:") || string_starts_with(model_, "github://")) {
+            rm_until_substring(model_, "github:");
+            rm_until_substring(model_, "://");
+            ret = github_dl(model_, bn);
+        } else { // ollama:// or nothing
+            rm_until_substring(model_, "ollama.com/library/");
+            rm_until_substring(model_, "://");
+            ret = ollama_dl(model_, bn);
        }

         model_ = bn;
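Note: both registry paths now go through the shared tag-splitting helper. A rough standalone sketch of that logic, assuming the same "name[:tag]" convention with "latest" as the default; the function name and the example model below are made up for illustration.

#include <iostream>
#include <string>
#include <utility>

// Split "model[:tag]" and build a manifest URL; the tag defaults to "latest".
static std::pair<std::string, std::string> split_model_and_tag(std::string model, const std::string & base_url) {
    std::string tag = "latest";
    const size_t colon_pos = model.find(':');
    if (colon_pos != std::string::npos) {
        tag   = model.substr(colon_pos + 1);
        model = model.substr(0, colon_pos);
    }
    return { model, base_url + model + "/manifests/" + tag };
}

int main() {
    const auto [name, url] = split_model_and_tag("library/granite-code:8b", "https://registry.ollama.ai/v2/");
    std::cout << name << "\n" << url << "\n";  // library/granite-code, .../granite-code/manifests/8b
}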


@@ -87,7 +87,7 @@ def test_completion_stream_vs_non_stream():
     assert content_stream == res_non_stream.body["content"]


-def test_completion_stream_with_openai_library():
+def test_completion_with_openai_library():
     global server
     server.start()
     client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
@@ -102,7 +102,7 @@ def test_completion_stream_with_openai_library():
     assert match_regex("(going|bed)+", res.choices[0].text)


-def test_completion_with_openai_library():
+def test_completion_stream_with_openai_library():
     global server
     server.start()
     client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")


@@ -308,7 +308,7 @@ if (GGML_CPU_ALL_VARIANTS)
         # MSVC doesn't support AMX
         ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
     endif()
-else ()
+elseif (GGML_CPU)
     ggml_add_cpu_backend_variant_impl("")
 endif()


@@ -1302,7 +1302,7 @@ struct ggml_threadpool {
     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop; // Used for stopping the threadpool altogether
     atomic_bool pause; // Used for pausing the threadpool or individual threads
-    atomic_bool abort; // Used for aborting processing of a graph
+    atomic_int abort; // Used for aborting processing of a graph

     struct ggml_compute_state * workers; // per thread state
     int n_threads_max; // number of threads in the pool
@@ -13851,14 +13851,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
        /*.threadpool=*/ tp,
     };

-    for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
+    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
         struct ggml_tensor * node = cgraph->nodes[node_n];

         ggml_compute_forward(&params, node);

         if (state->ith == 0 && cplan->abort_callback &&
                 cplan->abort_callback(cplan->abort_callback_data)) {
-            tp->abort = true;
+            atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
             tp->ec    = GGML_STATUS_ABORTED;
         }

@@ -14031,7 +14031,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
     threadpool->current_chunk = 0;
     threadpool->stop          = false;
     threadpool->pause         = tpp->paused;
-    threadpool->abort         = false;
+    threadpool->abort         = -1;
     threadpool->workers       = NULL;
     threadpool->n_threads_max = tpp->n_threads;
     threadpool->n_threads_cur = tpp->n_threads;
@@ -14110,7 +14110,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         threadpool->cgraph        = cgraph;
         threadpool->cplan         = cplan;
         threadpool->current_chunk = 0;
-        threadpool->abort         = false;
+        threadpool->abort         = -1;
         threadpool->ec            = GGML_STATUS_SUCCESS;
     }
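Note: the abort flag is now an index rather than a boolean. Storing node_n + 1 when the abort callback fires means every worker keeps running until its own loop counter reaches that index, so all threads leave the graph at the same node; -1 means no abort is pending. A minimal sketch of the idea using C++ atomics (the ggml code itself uses C11 atomics; the types and names below are simplified stand-ins):

#include <atomic>
#include <vector>

// Simplified stand-ins for the ggml threadpool and graph; illustration only.
struct fake_threadpool {
    std::atomic<int> abort{-1};   // -1: no abort; otherwise: first node NOT to execute
};

static void compute_worker(fake_threadpool & tp, const std::vector<int> & nodes, bool is_main_thread) {
    for (int node_n = 0; node_n < (int) nodes.size() &&
                         tp.abort.load(std::memory_order_relaxed) != node_n; node_n++) {
        // ... execute graph node node_n ...

        const bool user_requested_abort = false;  // stand-in for cplan->abort_callback(...)
        if (is_main_thread && user_requested_abort) {
            // let every thread finish node_n, then stop before node_n + 1
            tp.abort.store(node_n + 1, std::memory_order_relaxed);
        }
    }
}

int main() {
    fake_threadpool tp;
    std::vector<int> nodes(8);
    compute_worker(tp, nodes, /*is_main_thread=*/true);
}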


@@ -46,20 +46,20 @@
 #define GGML_CUDA_CC_VOLTA 700
 #define GGML_CUDA_CC_TURING 750
 #define GGML_CUDA_CC_AMPERE 800
-#define GGML_CUDA_CC_OFFSET_AMD 1000000
+#define GGML_CUDA_CC_OFFSET_AMD 0x1000000

 // GCN/CNDA, wave size is 64
-#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
-#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
-#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
-#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
-#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
+#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
+#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
+#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906) // MI50/Radeon VII, minimum for dp4a
+#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 0x908) // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x910) // MI210, minimum acc register renameing
+#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x942) // MI300

 // RNDA removes MFMA, dp4a, xnack, acc registers, wave size is 32
-#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
-#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
-#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
+#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 0x1010) // RX 5000
+#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 0x1030) // RX 6000, minimum for dp4a
+#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 0x1100) // RX 7000, minimum for WMMA

 #define GGML_CUDA_CC_QY1 210
 #define GGML_CUDA_CC_QY2 220
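Note: with the offset and the per-architecture values both in hex, the AMD compute-capability encoding lines up digit-for-digit with the gfx name. As a quick worked check (illustrative only, with local copies of the defines): gfx1030 becomes 0x1000000 + 0x1030 = 0x1001030, which is GGML_CUDA_CC_RDNA2, and the low 16 bits read back as 0x1030.

// Worked check of the encoding above; the underscored names are local copies
// made for this snippet, not the real macros.
constexpr int GGML_CUDA_CC_OFFSET_AMD_ = 0x1000000;
constexpr int GGML_CUDA_CC_RDNA2_      = GGML_CUDA_CC_OFFSET_AMD_ + 0x1030;
static_assert(GGML_CUDA_CC_RDNA2_ == 0x1001030, "gfx1030 encodes as 0x1001030");
static_assert((GGML_CUDA_CC_RDNA2_ & 0xffff) == 0x1030, "low 16 bits read back as the gfx number");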


@@ -42,6 +42,7 @@
 #include <algorithm>
 #include <array>
 #include <atomic>
+#include <charconv>
 #include <cinttypes>
 #include <cstddef>
 #include <cstdint>
@@ -119,12 +120,78 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
 #endif
 }

+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+static int ggml_cuda_parse_id(char devName[]) {
+    // A list of possible Target IDs can be found under the rocclr/clr repo in device.cpp
+    // these values are not stable so this is susceptible to breakage
+    // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/device.cpp
+    int archMajor = 0x0;
+    int archMinor = 0x0;
+    int archNum = GGML_CUDA_CC_OFFSET_AMD;
+    int archLen = strlen(devName);
+    char archName[archLen + 1];
+
+    // strip leading 'gfx' while copying into our buffer
+    if (archLen > 3) {
+        strcpy(archName, &devName[3]);
+        archLen -= 3;
+    }
+
+    // trim trailing :xnack- or :sramecc- statuses
+    archLen = strcspn(archName, ":");
+    archName[archLen] = '\0';
+
+    // tease out the version information
+    if (archLen > 8) {
+        // versions labeled generic use '-' as delimiter
+        // strip the trailing "-generic" then iterate through what remains
+        if ((strstr(archName, "-generic"))) {
+            archName[archLen - 8] = '\0';
+            char * pch;
+            if ((pch = strtok(archName, "-"))) {
+                archMajor = (int)strtoul(pch, 0, 16);
+                if ((pch = strtok(NULL, "-"))) {
+                    archMinor = 0x10 * (int)strtoul(pch, 0, 16);
+                }
+            }
+        }
+    } else if (archLen >= 3) {
+        // last two digits should be the minor * 0x10 + stepping
+        archMinor = (int)strtoul(&archName[archLen - 2], 0, 16);
+        archName[archLen - 2] = '\0';
+
+        // only the major version remains
+        archMajor = (int)strtoul(archName, 0, 16);
+    }
+
+    archNum += archMajor * 0x100;
+    archNum += archMinor;
+    return archNum;
+}
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+
 static ggml_cuda_device_info ggml_cuda_init() {
 #ifdef __HIP_PLATFORM_AMD__
     // Workaround for a rocBLAS bug when using multiple graphics cards:
     // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
-    rocblas_initialize();
-    CUDA_CHECK(cudaDeviceSynchronize());
+    {
+        int major_version = 0;
+        size_t version_length = 0;
+        if (rocblas_get_version_string_size(&version_length) == rocblas_status_success) {
+            std::string version(version_length, '\0');
+            if (rocblas_get_version_string(version.data(), version.size()) == rocblas_status_success) {
+                version.resize(::strlen(version.c_str()));
+                int parsed_value = 0;
+                if (std::from_chars(version.c_str(), version.c_str() + version.length(), parsed_value).ec == std::errc()) {
+                    major_version = parsed_value;
+                }
+            }
+        }
+        if (major_version < 4) {
+            GGML_LOG_DEBUG(GGML_CUDA_NAME " calling rocblas_initialize as a workaround for a rocBLAS bug\n");
+            rocblas_initialize();
+            CUDA_CHECK(cudaDeviceSynchronize());
+        }
+    }
 #endif

     ggml_cuda_device_info info = {};
@@ -169,7 +236,6 @@ static ggml_cuda_device_info ggml_cuda_init() {
         cudaDeviceProp prop;
         CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
-        GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");

         info.default_tensor_split[id] = total_vram;
         total_vram += prop.totalGlobalMem;
@@ -178,10 +244,25 @@ static ggml_cuda_device_info ggml_cuda_init() {
         info.devices[id].smpb = prop.sharedMemPerBlock;
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
         info.devices[id].smpbo = prop.sharedMemPerBlock;
-        info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
+
+        info.devices[id].cc = ggml_cuda_parse_id(prop.gcnArchName);
+        if ((info.devices[id].cc & 0xff00) == 0x0) {
+            GGML_LOG_WARN("invalid architecture ID received for device %d %s: %s cc %d.%d\n",
+                          id, prop.name, prop.gcnArchName, prop.major, prop.minor);
+
+            // Fallback to prop.major and prop.minor
+            if (prop.major > 0) {
+                info.devices[id].cc = GGML_CUDA_CC_OFFSET_AMD + prop.major * 0x100;
+                info.devices[id].cc += prop.minor * 0x10;
+            }
+        }
+        GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s\n",
+                      id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff, device_vmm ? "yes" : "no");
 #else
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
         info.devices[id].cc = 100*prop.major + 10*prop.minor;
+        GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
+                      id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     }
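Note: the new <charconv> include exists for the rocBLAS version gate above. std::from_chars pulls the leading major number out of the version string without exceptions or locale handling, and the workaround only runs when that number is below 4. A rough standalone sketch of the parsing step, with a made-up version string:

#include <charconv>
#include <iostream>
#include <string>

// Parse the leading integer of a dotted version string; return 0 on failure.
static int parse_major(const std::string & version) {
    int value = 0;
    const auto res = std::from_chars(version.c_str(), version.c_str() + version.size(), value);
    return res.ec == std::errc() ? value : 0;
}

int main() {
    const std::string version = "3.12.0.60102-1c23ba64";  // illustrative only
    if (parse_major(version) < 4) {
        std::cout << "old rocBLAS: apply the rocblas_initialize() workaround\n";
    }
}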


@@ -13,6 +13,12 @@ __device__ float __forceinline__ t2f32<half>(half val) {
     return __half2float(val);
 }

+// When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
+// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpass-failed"
+#endif
 template <bool use_shared, int ncols_template, int block_size_template, typename T>
 static __global__ void soft_max_f32(
         const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y,
@@ -118,6 +124,9 @@ static __global__ void soft_max_f32(
         dst[col] = vals[col] * inv_sum;
     }
 }
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif

 static __global__ void soft_max_back_f32(
         const float * grad, const float * dstf, float * dst, const int ncols, const float scale) {


@@ -3878,10 +3878,6 @@ static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor
     ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_diag_mask_inf);
 }

-static void ggml_sycl_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
-    ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_soft_max);
-}
-
 static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented
     ggml_sycl_op_flatten(ctx, dst->src[0], dst->src[1], dst, ggml_sycl_op_rope);
@@ -4090,7 +4086,7 @@ bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tens
             ggml_sycl_diag_mask_inf(ctx, dst);
             break;
         case GGML_OP_SOFT_MAX:
-            ggml_sycl_soft_max(ctx, dst);
+            ggml_sycl_op_soft_max(ctx, dst);
             break;
         case GGML_OP_ROPE:
             ggml_sycl_rope(ctx, dst);


@@ -1,7 +1,7 @@
-#include "norm.hpp"
+#include "softmax.hpp"

-template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
+template <bool vals_smem, int ncols_template, int block_size_template, typename T>
+static void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par,
                          const int nrows_y, const float scale, const float max_bias, const float m0,
                          const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
     const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@@ -29,7 +29,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         slope = sycl::pow(base, float(exp));
     }

-    float *vals = vals_smem ? buf + std::max(nwarps, WARP_SIZE) : dst + rowx * ncols;
+    float *vals = vals_smem ? buf + sycl::max(nwarps, WARP_SIZE) : dst + rowx * ncols;
     float max_val = -INFINITY;

     for (int col0 = 0; col0 < ncols; col0 += block_size) {
@@ -42,7 +42,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
         const int ix = rowx*ncols + col;
         const int iy = rowy*ncols + col;

-        const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);
+        const float val = x[ix]*scale + (mask ? slope*static_cast<float>(mask[iy]) : 0.0f);

         vals[col] = val;
         max_val = sycl::max(max_val, val);
@@ -65,7 +65,7 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
             item_ct1.barrier(sycl::access::fence_space::local_space);
             max_val = buf[lane_id];
             for (size_t i = 1; i < nreduce; i += 1) {
-                max_val = std::max(max_val, buf[lane_id + i * WARP_SIZE]);
+                max_val = sycl::max(max_val, buf[lane_id + i * WARP_SIZE]);
             }
             max_val = warp_reduce_max(max_val, item_ct1);
         }
@@ -122,8 +122,8 @@ static void soft_max_f32(const float * x, const float * mask, float * dst, const
     }
 }

-template <bool vals_smem, int ncols_template, int block_size_template>
-static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
+template <bool vals_smem, int ncols_template, int block_size_template, typename T>
+static void soft_max_f32_submitter(const float * x, const T * mask, float * dst, const int ncols_par,
                                    const int nrows_y, const float scale, const float max_bias, const float m0,
                                    const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
                                    const size_t n_local_scratch, queue_ptr stream) {
@@ -141,7 +141,8 @@ static void soft_max_f32_submitter(const float * x, const float * mask, float *
     });
 }

-static void soft_max_f32_sycl(const float * x, const float * mask,
+template<typename T>
+static void soft_max_f32_sycl(const float * x, const T * mask,
                               float * dst, const int ncols_x, const int nrows_x,
                               const int nrows_y, const float scale, const float max_bias,
                               queue_ptr stream, int device) {
@@ -223,22 +224,16 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
     }
 }

-void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
-                           const ggml_tensor *src1, ggml_tensor *dst,
-                           const float *src0_dd, const float *src1_dd,
-                           float *dst_dd,
-                           const queue_ptr &main_stream) {
+void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {

-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
-    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+    GGML_ASSERT(!dst->src[1] || dst->src[1]->type == GGML_TYPE_F16 || dst->src[1]->type == GGML_TYPE_F32); // src1 contains mask and it is optional

-    const int64_t ne00    = src0->ne[0];
-    const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src0->ne[1];
+    const int64_t ne00    = dst->src[0]->ne[0];
+    const int64_t nrows_x = ggml_nrows(dst->src[0]);
+    const int64_t nrows_y = dst->src[0]->ne[1];

     float scale = 1.0f;
     float max_bias = 0.0f;
@@ -246,6 +241,21 @@ void ggml_sycl_op_soft_max(ggml_backend_sycl_context & ctx, const ggml_tensor *s
     memcpy(&scale, dst->op_params + 0, sizeof(float));
     memcpy(&max_bias, dst->op_params + 1, sizeof(float));

-    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
-                      nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
+    const float * src0_dd = static_cast<const float *>(dst->src[0]->data);
+    float * dst_dd = static_cast<float *>(dst->data);
+
+    ggml_sycl_set_device(ctx.device);
+    dpct::queue_ptr main_stream = ctx.stream();
+
+    if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F16) {
+        const sycl::half * src1_dd = static_cast<sycl::half *>(dst->src[1]->data);
+        soft_max_f32_sycl<sycl::half>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias,
+                                      main_stream, ctx.device);
+    } else if (dst->src[1] && dst->src[1]->type == GGML_TYPE_F32) {
+        const float * src1_dd = static_cast<const float *>(dst->src[1]->data);
+        soft_max_f32_sycl<float>(src0_dd, src1_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
+    } else {
+        /* mask unavailable */
+        soft_max_f32_sycl<float>(src0_dd, nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream, ctx.device);
+    }
 }


@@ -15,10 +15,6 @@

 #include "common.hpp"

-void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, const ggml_tensor *src0,
-                           const ggml_tensor *src1, ggml_tensor *dst,
-                           const float *src0_dd, const float *src1_dd,
-                           float *dst_dd,
-                           const queue_ptr &main_stream);
+void ggml_sycl_op_soft_max(ggml_backend_sycl_context &ctx, ggml_tensor *dst);

 #endif // GGML_SYCL_SOFTMAX_HPP


@@ -128,6 +128,10 @@ static void ggml_print_backtrace_symbols(void) {
 #endif

 static void ggml_print_backtrace(void) {
+    const char * GGML_NO_BACKTRACE = getenv("GGML_NO_BACKTRACE");
+    if (GGML_NO_BACKTRACE) {
+        return;
+    }
     char attach[32];
     snprintf(attach, sizeof(attach), "attach %d", getpid());
     int pid = fork();


@@ -1 +1 @@
-d92321c0d151fe73a47d89738c7c3091ac904297
+32f0b85987396945afea2291d5f4c5862434292b


@@ -819,7 +819,7 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
     for (const auto & file : files) {
         auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
         auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
-        std::unique_ptr<llama_mmap> mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn()));
+        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
         mmaps_used.emplace_back(mapping->size(), 0);
         if (mlock_mmaps) {
             std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());


@@ -1245,8 +1245,13 @@ struct llama_vocab::impl {
     std::vector<llama_token> cache_special_tokens;
     std::vector<std::string> cache_token_to_piece; // llama_token_to_piece(special = true);

-    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+    struct pair_hash {
+        size_t operator()(const std::pair<std::string, std::string> & p) const {
+            return std::hash<std::string>{}(p.first) ^ //create some hash for pair
+                   (std::hash<std::string>{}(p.second) << 1);
+        }
+    };
+    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> bpe_ranks;

     // set of all tokens that cause "end of generation"
     std::set<llama_token> special_eog_ids;
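Note: std::unordered_map has no built-in hash for std::pair keys, so moving bpe_ranks off std::map requires the small hasher above. The sketch below shows the same combiner in isolation; it is a simple XOR/shift mix, good enough for BPE merge-rank lookup but not a general-purpose pair hash.

#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>

// Same combiner as in the diff, shown standalone for illustration.
struct pair_hash {
    size_t operator()(const std::pair<std::string, std::string> & p) const {
        return std::hash<std::string>{}(p.first) ^ (std::hash<std::string>{}(p.second) << 1);
    }
};

int main() {
    std::unordered_map<std::pair<std::string, std::string>, int, pair_hash> ranks;
    ranks[{ "th", "e" }] = 0;   // merge-rank lookup is now O(1) on average instead of O(log n)
    std::cout << ranks.at({ "th", "e" }) << "\n";
}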


@@ -2347,11 +2347,12 @@ struct test_soft_max : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
     const bool mask;
+    const ggml_type m_prec;
     const float scale;
     const float max_bias;

     std::string vars() override {
-        return VARS_TO_STR5(type, ne, mask, scale, max_bias);
+        return VARS_TO_STR6(type, ne, mask, m_prec, scale, max_bias);
     }

     // the 1024 test with bias occasionally fails:
@@ -2363,9 +2364,10 @@ struct test_soft_max : public test_case {
     test_soft_max(ggml_type type = GGML_TYPE_F32,
             std::array<int64_t, 4> ne = {10, 5, 4, 3},
             bool mask = false,
+            ggml_type m_prec = GGML_TYPE_F32,
             float scale = 1.0f,
             float max_bias = 0.0f)
-        : type(type), ne(ne), mask(mask), scale(scale), max_bias(max_bias) {}
+        : type(type), ne(ne), mask(mask), m_prec(m_prec), scale(scale), max_bias(max_bias) {}

     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
@@ -2374,7 +2376,7 @@ struct test_soft_max : public test_case {
         ggml_tensor * mask = nullptr;
         if (this->mask) {
-            mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
+            mask = ggml_new_tensor_2d(ctx, m_prec, ne[0], ne[1]);
             ggml_set_name(mask, "mask");
         }
@@ -4150,17 +4152,28 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             for (float scale : {1.0f, 0.1f}) {
                 for (int64_t ne0 : {16, 1024}) {
                     for (int64_t ne1 : {16, 1024}) {
-                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias));
-                        test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias));
+                        if (mask) {
+                            for (ggml_type m_prec : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+                                test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, m_prec, scale, max_bias));
+                                test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, m_prec, scale, max_bias));
+                            }
+                        } else {
+                            /* The precision of mask here doesn't matter as boolean mask is false */
+                            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, GGML_TYPE_F32, scale, max_bias));
+                            test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, GGML_TYPE_F32, scale, max_bias));
+                        }
                     }
                 }
             }
         }
     }
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {16, 2, 32, 1}, false, GGML_TYPE_F32, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F32, 0.1f, 8.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, GGML_TYPE_F16, 0.1f, 8.0f));

     for (float max_bias : {0.0f, 8.0f}) {
         for (float scale : {1.0f, 0.1f}) {
@@ -4296,13 +4309,13 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {8192, 512, 2, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {3072, 512, 2, 1}, {0, 2, 1, 3}));

-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, 1.0f, 0.0f));
-    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {4096, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 4096, 5, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {1024, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 1024, 10, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {256, 256, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {64, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));
+    test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {77, 64, 20, 1}, false, GGML_TYPE_F32, 1.0f, 0.0f));

     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32, 10, 1, 1}));
     test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));