Merge branch '0cc4m/vulkan-subgroup-size-control' of https://github.com/ggerganov/llama.cpp into vulkan
commit 8972f1d35c

12 changed files with 551 additions and 415 deletions

@@ -1992,6 +1992,14 @@ class Qwen2Model(Model):
        except FileNotFoundError:
            self._set_vocab_gpt2()

+   def set_gguf_parameters(self):
+       super().set_gguf_parameters()
+       if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+           if self.hparams["rope_scaling"].get("type") == "yarn":
+               self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+               self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+               self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+

@Model.register("Qwen2MoeForCausalLM")
class Qwen2MoeModel(Model):

@@ -687,12 +687,14 @@ This endpoint is public (no API key check). By default, it is read-only. To make
    }
  },
  "total_slots": 1,
  "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf",
  "chat_template": "..."
}
```

- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which contains the same fields as the `generation_settings` response object from the `/completion` endpoint.
- `total_slots` - the total number of slots for processing requests (defined by the `--parallel` option)
- `model_path` - the path to the model file (same as the `-m` argument)
- `chat_template` - the model's original Jinja2 prompt template

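As a quick, hypothetical illustration (not part of this diff), the fields documented above can be read with a few lines of Python; the host and port are assumptions:

```python
# Illustrative sketch: fetch /props and print the documented fields.
# Assumes a llama-server instance is listening on localhost:8080.
import requests

props = requests.get("http://localhost:8080/props").json()
print(props["total_slots"])                            # number of slots (--parallel)
print(props["model_path"])                             # model file passed via -m
print(props["default_generation_settings"]["n_ctx"])   # per-slot context size
print(props["chat_template"][:80])                     # start of the Jinja2 template
```
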
### POST `/props`: Change server global properties.

(File diff suppressed because it is too large.)

@@ -22,7 +22,12 @@ def test_server_props():
    server.start()
    res = server.make_request("GET", "/props")
    assert res.status_code == 200
+   assert ".gguf" in res.body["model_path"]
    assert res.body["total_slots"] == server.n_slots
+   default_val = res.body["default_generation_settings"]
+   assert server.n_ctx is not None and server.n_slots is not None
+   assert default_val["n_ctx"] == server.n_ctx / server.n_slots
+   assert default_val["params"]["seed"] == server.seed


def test_server_models():

@@ -33,6 +38,31 @@ def test_server_models():
    assert len(res.body["data"]) == 1
    assert res.body["data"][0]["id"] == server.model_alias


+def test_server_slots():
+   global server
+
+   # without the slots endpoint enabled, this should return an error
+   server.server_slots = False
+   server.start()
+   res = server.make_request("GET", "/slots")
+   assert res.status_code == 501  # ERROR_TYPE_NOT_SUPPORTED
+   assert "error" in res.body
+   server.stop()
+
+   # with the slots endpoint enabled, this should return slots info
+   server.server_slots = True
+   server.n_slots = 2
+   server.start()
+   res = server.make_request("GET", "/slots")
+   assert res.status_code == 200
+   assert len(res.body) == server.n_slots
+   assert server.n_ctx is not None and server.n_slots is not None
+   assert res.body[0]["n_ctx"] == server.n_ctx / server.n_slots
+   assert "params" in res.body[0]
+   assert res.body[0]["params"]["seed"] == server.seed
+
+
def test_load_split_model():
    global server
    server.model_hf_repo = "ggml-org/models"

@@ -30,6 +30,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte
        ],
    })
    assert res.status_code == 200
+   assert "cmpl" in res.body["id"]  # make sure the completion id has the expected format
    assert res.body["model"] == model if model is not None else server.model_alias
    assert res.body["usage"]["prompt_tokens"] == n_prompt
    assert res.body["usage"]["completion_tokens"] == n_predicted

@@ -59,9 +60,13 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte
        "stream": True,
    })
    content = ""
+   last_cmpl_id = None
    for data in res:
        choice = data["choices"][0]
        assert "gpt-3.5" in data["model"]  # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future
+       if last_cmpl_id is None:
+           last_cmpl_id = data["id"]
+       assert last_cmpl_id == data["id"]  # make sure the completion id is the same for all events in the stream
        if choice["finish_reason"] in ["stop", "length"]:
            assert data["usage"]["prompt_tokens"] == n_prompt
            assert data["usage"]["completion_tokens"] == n_predicted

@@ -64,6 +64,7 @@ class ServerProcess:
    server_embeddings: bool | None = False
    server_reranking: bool | None = False
    server_metrics: bool | None = False
+   server_slots: bool | None = False
    draft: int | None = None
    api_key: str | None = None
    response_format: str | None = None

@@ -91,7 +92,6 @@ class ServerProcess:
        else:
            server_path = "../../../build/bin/llama-server"
        server_args = [
-           "--slots",  # requires to get slot status via /slots endpoint
            "--host",
            self.server_host,
            "--port",

@@ -129,6 +129,8 @@ class ServerProcess:
            server_args.append("--reranking")
        if self.server_metrics:
            server_args.append("--metrics")
+       if self.server_slots:
+           server_args.append("--slots")
        if self.model_alias:
            server_args.extend(["--alias", self.model_alias])
        if self.n_ctx:

@@ -181,7 +183,7 @@ class ServerProcess:
        start_time = time.time()
        while time.time() - start_time < timeout_seconds:
            try:
-               response = self.make_request("GET", "/slots", headers={
+               response = self.make_request("GET", "/health", headers={
                    "Authorization": f"Bearer {self.api_key}" if self.api_key else None
                })
                if response.status_code == 200:

@@ -224,7 +226,7 @@ class ServerProcess:
        result.headers = dict(response.headers)
        result.status_code = response.status_code
        result.body = response.json() if parse_body else None
-       print("Response from server", result.body)
+       print("Response from server", json.dumps(result.body, indent=2))
        return result

    def make_stream_request(

@@ -245,7 +247,7 @@ class ServerProcess:
                break
            elif line.startswith('data: '):
                data = json.loads(line[6:])
-               print("Partial response from server", data)
+               print("Partial response from server", json.dumps(data, indent=2))
                yield data

@@ -164,6 +164,9 @@ static std::vector<llama_tokens> tokenize_input_prompts(llama_context * ctx, con
    } else {
        throw std::runtime_error("\"prompt\" must be a string, a list of tokens, a list of mixed strings & tokens, or a list of prompts");
    }
+   if (result.empty()) {
+       throw std::runtime_error("\"prompt\" must not be empty");
+   }
    return result;
}

@@ -496,8 +499,6 @@ static json oaicompat_completion_params_parse(
    const std::string & chat_template) {
    json llama_params;

-   llama_params["__oaicompat"] = true;
-
    // Apply chat template to the list of messages
    llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));

@@ -648,3 +649,18 @@ static json format_detokenized_response(const std::string & content) {
        {"content", content}
    };
}
+
+static json format_logit_bias(const std::vector<llama_logit_bias> & logit_bias) {
+   json data = json::array();
+   for (const auto & lb : logit_bias) {
+       data.push_back(json{
+           {"bias", lb.bias},
+           {"token", lb.token},
+       });
+   }
+   return data;
+}
+
+static std::string safe_json_to_str(json data) {
+   return data.dump(-1, ' ', false, json::error_handler_t::replace);
+}

@@ -8,6 +8,20 @@ if (Vulkan_FOUND)
        ../../include/ggml-vulkan.h
    )

+   # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported.
+   # If it's not, there will be an error to stderr.
+   # If it's supported, set a define to indicate that we should compile those shaders
+   execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp"
+                   OUTPUT_VARIABLE glslc_output
+                   ERROR_VARIABLE glslc_error)
+
+   if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*")
+       message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc")
+   else()
+       message(STATUS "GL_NV_cooperative_matrix2 supported by glslc")
+       add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
+   endif()
+
    target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan)
    target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})

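For reference, the probe above can be reproduced by hand outside CMake. A minimal sketch, assuming `glslc` is on `PATH` and run from the directory containing the `vulkan-shaders/` folder (the relative path is an assumption, not taken from this diff):

```python
# Sketch of the CMake coopmat2 probe, run manually:
# compile the test shader and look for the "extension not supported" error on stderr.
import subprocess

proc = subprocess.run(
    ["glslc", "-o", "-", "-fshader-stage=compute", "--target-env=vulkan1.3",
     "vulkan-shaders/test_coopmat2_support.comp"],   # path is an assumption
    stdout=subprocess.DEVNULL,   # discard the SPIR-V binary on success
    stderr=subprocess.PIPE, text=True,
)
if "extension not supported: GL_NV_cooperative_matrix2" in proc.stderr:
    print("GL_NV_cooperative_matrix2 not supported by glslc")
else:
    print("GL_NV_cooperative_matrix2 supported by glslc")
```
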
@@ -168,7 +168,11 @@ struct vk_device_struct {
    uint32_t subgroup_size;
    uint32_t shader_core_count;
    bool uma;
-   bool coopmat2;

+   bool subgroup_size_control;
+   uint32_t subgroup_min_size;
+   uint32_t subgroup_max_size;
+   bool subgroup_require_full_support;

    bool coopmat_support;
    bool coopmat_acc_f32_support;

@@ -176,6 +180,7 @@ struct vk_device_struct {
    uint32_t coopmat_m;
    uint32_t coopmat_n;
    uint32_t coopmat_k;
+   bool coopmat2;

    size_t idx;

@@ -753,8 +758,12 @@ static uint32_t compile_count = 0;
static std::mutex compile_count_mutex;
static std::condition_variable compile_count_cond;

-static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align, bool disable_robustness) {
-   VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
+static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint,
+                                         uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants,
+                                         uint32_t align, bool disable_robustness, bool require_full_subgroups, uint32_t required_subgroup_size) {
+   VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size <<
+                ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align <<
+                ", " << disable_robustness << ", " << require_full_subgroups << ", " << required_subgroup_size << ")");
    GGML_ASSERT(parameter_count > 0);
    GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT

@@ -813,14 +822,28 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipelin
        specialization_constants.data()
    );

+   vk::PipelineShaderStageCreateFlags pipeline_shader_stage_create_flags{};
+
+   if (device->subgroup_require_full_support && require_full_subgroups) {
+       pipeline_shader_stage_create_flags |= vk::PipelineShaderStageCreateFlagBits::eRequireFullSubgroupsEXT;
+   }
+
    vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
-       vk::PipelineShaderStageCreateFlags(),
+       pipeline_shader_stage_create_flags,
        vk::ShaderStageFlagBits::eCompute,
        pipeline->shader_module,
        entrypoint.c_str(),
        &specialization_info);

+   vk::PipelineShaderStageRequiredSubgroupSizeCreateInfoEXT pipeline_shader_stage_required_subgroup_size_create_info;
+   pipeline_shader_stage_required_subgroup_size_create_info.requiredSubgroupSize = required_subgroup_size;
+   if (device->subgroup_size_control && required_subgroup_size > 0) {
+       GGML_ASSERT(device->subgroup_min_size <= required_subgroup_size && required_subgroup_size <= device->subgroup_max_size);
+       pipeline_shader_create_info.setPNext(&pipeline_shader_stage_required_subgroup_size_create_info);
+   }
+
    vk::ComputePipelineCreateInfo compute_pipeline_create_info(
-       vk::PipelineCreateFlags(),
+       vk::PipelineCreateFlags{},
        pipeline_shader_create_info,
        pipeline->layout);

@@ -1500,7 +1523,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
    device->pipeline_matmul_id_f32 = std::make_shared<vk_matmul_pipeline_struct>();

    std::vector<std::future<void>> compiles;
-   auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants, uint32_t align, bool disable_robustness = false) {
+   auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint,
+                                             uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, const std::vector<uint32_t>& specialization_constants,
+                                             uint32_t align, bool disable_robustness = false, bool require_full_subgroups = false, uint32_t required_subgroup_size = 0) {
        {
            // wait until fewer than N compiles are in progress
            uint32_t N = std::max(1u, std::thread::hardware_concurrency());

@@ -1510,10 +1535,11 @@ static void ggml_vk_load_shaders(vk_device& device) {
            }
            compile_count++;
        }
-       compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness));
+       compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint,
+                                     parameter_count, push_constant_size, wg_denoms, specialization_constants, align, disable_robustness, require_full_subgroups, required_subgroup_size));
    };

-#if defined(VK_NV_cooperative_matrix2)
+#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    if (device->coopmat2) {

        auto const &fa_wg_denoms = [&](uint32_t D, uint32_t clamp, ggml_type type, bool small_rows) -> std::array<uint32_t, 3> {

@@ -1611,22 +1637,22 @@ static void ggml_vk_load_shaders(vk_device& device) {
#undef CREATE_MM
#undef CREATE_MM2
    } else
-#endif // defined(VK_NV_cooperative_matrix2)
+#endif // defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    if (device->coopmat_support) {
        // Create 6 variants, {s,m,l}x{unaligned,aligned}
#define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \
        if (device->mul_mat ## ID ## _l) \
-           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1); \
+           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \
        if (device->mul_mat ## ID ## _m) \
-           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1); \
+           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \
        if (device->mul_mat ## ID ## _s) \
-           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1); \
+           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _coopmat_len, NAMELC ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \
        if (device->mul_mat ## ID ## _l) \
-           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align); \
+           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \
        if (device->mul_mat ## ID ## _m) \
-           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align); \
+           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \
        if (device->mul_mat ## ID ## _s) \
-           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align); \
+           ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _coopmat_len, NAMELC ## _aligned ## F16ACC ## _coopmat_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \

        // Create 2 variants, {f16,f32} accumulator
#define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \

@@ -1999,6 +2025,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
            amd_shader_core_properties2 = true;
        } else if (strcmp("VK_EXT_pipeline_robustness", properties.extensionName) == 0) {
            pipeline_robustness = true;
+       } else if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) {
+           device->subgroup_size_control = true;
        } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
                   !getenv("GGML_VK_DISABLE_COOPMAT")) {
            device->coopmat_support = true;

@@ -2018,6 +2046,8 @@ static vk_device ggml_vk_get_device(size_t idx) {
    vk::PhysicalDeviceDriverProperties driver_props;
    vk::PhysicalDeviceShaderSMBuiltinsPropertiesNV sm_props;
    vk::PhysicalDeviceShaderCoreProperties2AMD amd_shader_core_properties2_props;
+   vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props;

    props2.pNext = &props3;
    props3.pNext = &subgroup_props;
    subgroup_props.pNext = &driver_props;

@@ -2036,6 +2066,10 @@ static vk_device ggml_vk_get_device(size_t idx) {
        last_struct->pNext = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
        last_struct = (VkBaseOutStructure *)&amd_shader_core_properties2_props;
    }
+   if (device->subgroup_size_control) {
+       last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_props;
+       last_struct = (VkBaseOutStructure *)&subgroup_size_control_props;
+   }

#if defined(VK_NV_cooperative_matrix2)
    vk::PhysicalDeviceCooperativeMatrix2PropertiesNV coopmat2_props;

@@ -2073,11 +2107,11 @@ static vk_device ggml_vk_get_device(size_t idx) {

    device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;

-   if (device->vendor_id == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
-       // Intel drivers don't support coopmat properly yet
-       // Only RADV supports coopmat properly on AMD
-       device->coopmat_support = false;
-   }
+   // if (device->vendor_id == VK_VENDOR_ID_INTEL || (device->vendor_id == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
+   //     // Intel drivers don't support coopmat properly yet
+   //     // Only RADV supports coopmat properly on AMD
+   //     device->coopmat_support = false;
+   // }

    std::vector<vk::QueueFamilyProperties> queue_family_props = device->physical_device.getQueueFamilyProperties();

@@ -2129,6 +2163,17 @@ static vk_device ggml_vk_get_device(size_t idx) {
        device_extensions.push_back("VK_EXT_pipeline_robustness");
    }

+   VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features;
+   subgroup_size_control_features.pNext = nullptr;
+   subgroup_size_control_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT;
+   subgroup_size_control_features.computeFullSubgroups = false;
+   subgroup_size_control_features.subgroupSizeControl = false;
+
+   if (device->subgroup_size_control) {
+       last_struct->pNext = (VkBaseOutStructure *)&subgroup_size_control_features;
+       last_struct = (VkBaseOutStructure *)&subgroup_size_control_features;
+   }
+
    VkPhysicalDeviceCooperativeMatrixFeaturesKHR coopmat_features;
    coopmat_features.pNext = nullptr;
    coopmat_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;

@@ -2156,10 +2201,21 @@ static vk_device ggml_vk_get_device(size_t idx) {

    device->pipeline_robustness = pl_robustness_features.pipelineRobustness;

+   device->subgroup_size_control = device->subgroup_size_control &&
+       (!(subgroup_size_control_props.requiredSubgroupSizeStages & vk::ShaderStageFlagBits::eCompute) ||
+        !subgroup_size_control_features.subgroupSizeControl);
+
+   if (device->subgroup_size_control) {
+       device->subgroup_min_size = subgroup_size_control_props.minSubgroupSize;
+       device->subgroup_max_size = subgroup_size_control_props.maxSubgroupSize;
+       device->subgroup_require_full_support = subgroup_size_control_features.computeFullSubgroups;
+       device_extensions.push_back("VK_EXT_subgroup_size_control");
+   }
+
    device->coopmat_support = device->coopmat_support && coopmat_features.cooperativeMatrix;

    if (coopmat2_support) {
-#if defined(VK_NV_cooperative_matrix2)
+#if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
        if (coopmat2_features.cooperativeMatrixWorkgroupScope &&
            coopmat2_features.cooperativeMatrixFlexibleDimensions &&
            coopmat2_features.cooperativeMatrixReductions &&

@@ -2420,22 +2476,27 @@ static void ggml_vk_print_gpu_info(size_t idx) {
    bool fp16_storage = false;
    bool fp16_compute = false;
    bool coopmat_support = false;
+   bool coopmat2_support = false;

    for (auto properties : ext_props) {
        if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
            fp16_storage = true;
        } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
            fp16_compute = true;
-       } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0) {
+       } else if (strcmp("VK_KHR_cooperative_matrix", properties.extensionName) == 0 &&
+                  !getenv("GGML_VK_DISABLE_COOPMAT")) {
            coopmat_support = true;
+       } else if (strcmp("VK_NV_cooperative_matrix2", properties.extensionName) == 0 &&
+                  !getenv("GGML_VK_DISABLE_COOPMAT2")) {
+           coopmat2_support = true;
        }
    }

-   if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
-       // Intel drivers don't support coopmat properly yet
-       // Only RADV supports coopmat properly on AMD
-       coopmat_support = false;
-   }
+   // if (props2.properties.vendorID == VK_VENDOR_ID_INTEL || (props2.properties.vendorID == VK_VENDOR_ID_AMD && driver_props.driverID == vk::DriverId::eAmdProprietary)) {
+   //     // Intel drivers don't support coopmat properly yet
+   //     // Only RADV supports coopmat properly on AMD
+   //     coopmat_support = false;
+   // }

    const char* GGML_VK_DISABLE_F16 = getenv("GGML_VK_DISABLE_F16");
    bool force_disable_f16 = GGML_VK_DISABLE_F16 != nullptr;

@@ -2478,9 +2539,11 @@ static void ggml_vk_print_gpu_info(size_t idx) {

    coopmat_support = coopmat_support && coopmat_features.cooperativeMatrix;

+   std::string matrix_cores = coopmat2_support ? "NV_coopmat2" : coopmat_support ? "KHR_coopmat" : "none";

    std::string device_name = props2.properties.deviceName.data();
-   GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %d\n",
-                  idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, coopmat_support);
+   GGML_LOG_DEBUG("ggml_vulkan: %zu = %s (%s) | uma: %d | fp16: %d | warp size: %zu | matrix cores: %s\n",
+                  idx, device_name.c_str(), driver_props.driverName.data(), uma, fp16, subgroup_size, matrix_cores.c_str());

    if (props2.properties.deviceType == vk::PhysicalDeviceType::eCpu) {
        GGML_LOG_DEBUG("ggml_vulkan: Warning: Device type is CPU. This is probably not the device you want.\n");

@@ -0,0 +1,7 @@
+#version 460
+
+#extension GL_NV_cooperative_matrix2 : require
+
+void main()
+{
+}

@@ -342,14 +342,14 @@ void process_shaders() {
        matmul_shaders(true, matmul_id, true, false, false);
        matmul_shaders(true, matmul_id, true, false, true);

-#if defined(VK_NV_cooperative_matrix2)
+#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
        // Coopmat2, fp32acc and fp16acc
        matmul_shaders(true, matmul_id, false, true, false);
        matmul_shaders(true, matmul_id, false, true, true);
#endif
    }

-#if defined(VK_NV_cooperative_matrix2)
+#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT)
    // flash attention
    for (const auto& f16acc : {false, true}) {
        std::string acctype = f16acc ? "float16_t" : "float";

@@ -761,6 +761,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
        MODEL_TENSOR.TOKEN_EMBD,
        MODEL_TENSOR.OUTPUT_NORM,
        MODEL_TENSOR.OUTPUT,
+       MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_NORM,
        MODEL_TENSOR.ATTN_Q,
        MODEL_TENSOR.ATTN_K,