Continue vulkan implementation and optimization
This commit is contained in:
parent
0c9cca00bd
commit
2c70df985a
3 changed files with 302 additions and 110 deletions
1
Makefile
1
Makefile
|
@ -215,6 +215,7 @@ endif # LLAMA_METAL
|
||||||
|
|
||||||
ifdef LLAMA_VULKAN
|
ifdef LLAMA_VULKAN
|
||||||
CFLAGS += -DGGML_USE_VULKAN
|
CFLAGS += -DGGML_USE_VULKAN
|
||||||
|
CXXFLAGS += -DGGML_USE_VULKAN
|
||||||
LDFLAGS += -lvulkan -lopenblas -lcblas
|
LDFLAGS += -lvulkan -lopenblas -lcblas
|
||||||
OBJS += ggml-vulkan.o
|
OBJS += ggml-vulkan.o
|
||||||
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
|
ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h
|
||||||
|
|
377
ggml-vulkan.cpp
377
ggml-vulkan.cpp
|
@ -1,7 +1,9 @@
|
||||||
#include "ggml-vulkan.h"
|
#include "ggml-vulkan.h"
|
||||||
|
|
||||||
|
#ifdef VK_CHK_KERNEL
|
||||||
#include <cblas.h>
|
#include <cblas.h>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <vulkan/vulkan.hpp>
|
#include <vulkan/vulkan.hpp>
|
||||||
#define VMA_IMPLEMENTATION
|
#define VMA_IMPLEMENTATION
|
||||||
|
@ -33,6 +35,8 @@ inline static void* ggml_aligned_malloc(size_t size, size_t alignment) {
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
#include <tuple>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
|
@ -42,8 +46,10 @@ inline static void* ggml_aligned_malloc(size_t size, size_t alignment) {
|
||||||
|
|
||||||
vk::Instance vk_instance;
|
vk::Instance vk_instance;
|
||||||
uint32_t vk_compute_queue_family_index;
|
uint32_t vk_compute_queue_family_index;
|
||||||
|
uint32_t vk_transfer_queue_family_index;
|
||||||
vk::PhysicalDevice vk_physical_device;
|
vk::PhysicalDevice vk_physical_device;
|
||||||
vk::Device vk_device;
|
vk::Device vk_device;
|
||||||
|
vk::CommandPool vk_command_pool_compute, vk_command_pool_transfer;
|
||||||
VmaAllocator vk_allocator;
|
VmaAllocator vk_allocator;
|
||||||
vk::DescriptorSetLayout vk_pipeline_matmul_dsl;
|
vk::DescriptorSetLayout vk_pipeline_matmul_dsl;
|
||||||
vk::PipelineLayout vk_pipeline_matmul_layout;
|
vk::PipelineLayout vk_pipeline_matmul_layout;
|
||||||
|
@ -53,6 +59,15 @@ vk::Buffer vk_buffer_qa, vk_buffer_a, vk_buffer_b, vk_buffer_c;
|
||||||
|
|
||||||
bool vk_fp16_support = false;
|
bool vk_fp16_support = false;
|
||||||
|
|
||||||
|
struct vk_buffer {
|
||||||
|
vk::Buffer buffer;
|
||||||
|
VmaAllocation allocation;
|
||||||
|
VmaAllocationInfo info;
|
||||||
|
size_t size = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
static std::vector<std::tuple<void*, size_t, vk_buffer>> vk_buf_list;
|
||||||
|
|
||||||
void ggml_vk_init(void) {
|
void ggml_vk_init(void) {
|
||||||
char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE");
|
char* GGML_VULKAN_DEVICE = getenv("GGML_VULKAN_DEVICE");
|
||||||
int dev_num = (GGML_VULKAN_DEVICE == NULL ? 0 : atoi(GGML_VULKAN_DEVICE));
|
int dev_num = (GGML_VULKAN_DEVICE == NULL ? 0 : atoi(GGML_VULKAN_DEVICE));
|
||||||
|
@ -67,15 +82,47 @@ void ggml_vk_init(void) {
|
||||||
std::cout << "ggml_vulkan: Using " << device_props.deviceName << std::endl;
|
std::cout << "ggml_vulkan: Using " << device_props.deviceName << std::endl;
|
||||||
|
|
||||||
std::vector<vk::QueueFamilyProperties> queue_family_props = vk_physical_device.getQueueFamilyProperties();
|
std::vector<vk::QueueFamilyProperties> queue_family_props = vk_physical_device.getQueueFamilyProperties();
|
||||||
auto prop_it = std::find_if(queue_family_props.begin(), queue_family_props.end(), [](const vk::QueueFamilyProperties& prop)
|
|
||||||
{
|
|
||||||
return prop.queueFlags & vk::QueueFlagBits::eCompute;
|
|
||||||
});
|
|
||||||
vk_compute_queue_family_index = std::distance(queue_family_props.begin(), prop_it);
|
|
||||||
|
|
||||||
const float queue_priority = 1.0f;
|
const size_t qfsize = queue_family_props.size();
|
||||||
vk::DeviceQueueCreateInfo device_queue_create_info(vk::DeviceQueueCreateFlags(), vk_compute_queue_family_index, 1, &queue_priority);
|
|
||||||
vk::DeviceCreateInfo device_create_info(vk::DeviceCreateFlags(), device_queue_create_info);
|
// Try to find a non-graphics compute queue and a transfer-focused queue
|
||||||
|
vk_compute_queue_family_index = qfsize;
|
||||||
|
vk_transfer_queue_family_index = qfsize;
|
||||||
|
for (size_t i = 0; i < qfsize; i++) {
|
||||||
|
// std::cout << i << ": " << "compute=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eCompute) << " transfer=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eTransfer) << " graphics=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eGraphics) << " protected=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eProtected) << " optical_flow_nv=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eOpticalFlowNV) << " sparse binding=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eSparseBinding) << " video decode=" << bool(queue_family_props[i].queueFlags & vk::QueueFlagBits::eVideoDecodeKHR) << std::endl;
|
||||||
|
if (vk_compute_queue_family_index >= qfsize && !(queue_family_props[i].queueFlags & vk::QueueFlagBits::eGraphics) && queue_family_props[i].queueFlags & vk::QueueFlagBits::eCompute) {
|
||||||
|
vk_compute_queue_family_index = i;
|
||||||
|
}
|
||||||
|
if (vk_transfer_queue_family_index >= qfsize && !(queue_family_props[i].queueFlags & (vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics | vk::QueueFlagBits::eVideoDecodeKHR | vk::QueueFlagBits::eProtected | vk::QueueFlagBits::eOpticalFlowNV)) && queue_family_props[i].queueFlags & vk::QueueFlagBits::eTransfer) {
|
||||||
|
vk_transfer_queue_family_index = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fall back to graphics and compute queue if not yet found
|
||||||
|
if (vk_compute_queue_family_index == qfsize) {
|
||||||
|
for (size_t i = 0; i < qfsize; i++) {
|
||||||
|
if (vk_compute_queue_family_index >= qfsize && queue_family_props[i].queueFlags & vk::QueueFlagBits::eCompute) {
|
||||||
|
vk_compute_queue_family_index = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (vk_compute_queue_family_index == qfsize) {
|
||||||
|
std::cerr << "ggml_vulkan: vk_compute_queue_family_index invalid" << std::endl;
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
if (vk_transfer_queue_family_index == qfsize) {
|
||||||
|
std::cerr << "ggml_vulkan: vk_transfer_queue_family_index invalid" << std::endl;
|
||||||
|
abort();
|
||||||
|
}
|
||||||
|
|
||||||
|
const float compute_queue_priority = 1.0f;
|
||||||
|
const float transfer_queue_priority = 1.0f;
|
||||||
|
vk::DeviceQueueCreateInfo device_queue_create_infos[] = {
|
||||||
|
{vk::DeviceQueueCreateFlags(), vk_compute_queue_family_index, 1, &compute_queue_priority},
|
||||||
|
{vk::DeviceQueueCreateFlags(), vk_transfer_queue_family_index, 1, &transfer_queue_priority},
|
||||||
|
};
|
||||||
|
vk::DeviceCreateInfo device_create_info(vk::DeviceCreateFlags(), device_queue_create_infos);
|
||||||
vk_device = vk_physical_device.createDevice(device_create_info);
|
vk_device = vk_physical_device.createDevice(device_create_info);
|
||||||
|
|
||||||
// Allocator
|
// Allocator
|
||||||
|
@ -133,6 +180,12 @@ void ggml_vk_init(void) {
|
||||||
pipeline_shader_create_info,
|
pipeline_shader_create_info,
|
||||||
vk_pipeline_matmul_layout);
|
vk_pipeline_matmul_layout);
|
||||||
vk_pipeline_matmul = vk_device.createComputePipeline(pipeline_cache, compute_pipeline_create_info).value;
|
vk_pipeline_matmul = vk_device.createComputePipeline(pipeline_cache, compute_pipeline_create_info).value;
|
||||||
|
|
||||||
|
vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(), vk_compute_queue_family_index);
|
||||||
|
vk_command_pool_compute = vk_device.createCommandPool(command_pool_create_info_compute);
|
||||||
|
|
||||||
|
vk::CommandPoolCreateInfo command_pool_create_info_transfer(vk::CommandPoolCreateFlags(), vk_transfer_queue_family_index);
|
||||||
|
vk_command_pool_transfer = vk_device.createCommandPool(command_pool_create_info_transfer);
|
||||||
}
|
}
|
||||||
|
|
||||||
// buffer pool for vulkan
|
// buffer pool for vulkan
|
||||||
|
@ -152,17 +205,43 @@ struct scoped_spin_lock {
|
||||||
scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
|
scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct vk_buffer {
|
|
||||||
vk::Buffer buffer;
|
|
||||||
VmaAllocation allocation;
|
|
||||||
VmaAllocationInfo info;
|
|
||||||
size_t size = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
static vk_buffer g_vk_buffer_pool[MAX_VK_BUFFERS];
|
static vk_buffer g_vk_buffer_pool[MAX_VK_BUFFERS];
|
||||||
static std::atomic_flag g_vk_pool_lock = ATOMIC_FLAG_INIT;
|
static std::atomic_flag g_vk_pool_lock = ATOMIC_FLAG_INIT;
|
||||||
|
|
||||||
static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf) {
|
static vk_buffer ggml_vk_create_buffer(size_t size, VmaAllocationCreateFlags alloc_flags, VmaMemoryUsage vma_usage, VkMemoryPropertyFlags req_flags = 0) {
|
||||||
|
vk_buffer buf;
|
||||||
|
|
||||||
|
buf.size = size;
|
||||||
|
vk::BufferCreateInfo buffer_create_info{
|
||||||
|
vk::BufferCreateFlags(),
|
||||||
|
size,
|
||||||
|
vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
|
||||||
|
vk::SharingMode::eExclusive,
|
||||||
|
1,
|
||||||
|
&vk_compute_queue_family_index
|
||||||
|
};
|
||||||
|
|
||||||
|
VmaAllocationCreateInfo allocation_info = {};
|
||||||
|
allocation_info.requiredFlags = req_flags;
|
||||||
|
allocation_info.flags = alloc_flags;
|
||||||
|
allocation_info.usage = vma_usage;
|
||||||
|
|
||||||
|
vmaCreateBuffer(vk_allocator,
|
||||||
|
(VkBufferCreateInfo*)&buffer_create_info,
|
||||||
|
&allocation_info,
|
||||||
|
(VkBuffer*)&buf.buffer,
|
||||||
|
&buf.allocation,
|
||||||
|
&buf.info);
|
||||||
|
|
||||||
|
return buf;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_vk_destroy_buffer(vk_buffer& buf) {
|
||||||
|
buf.size = 0;
|
||||||
|
vmaDestroyBuffer(vk_allocator, buf.buffer, buf.allocation);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf, VmaAllocationCreateFlags alloc_flags) {
|
||||||
scoped_spin_lock lock(g_vk_pool_lock);
|
scoped_spin_lock lock(g_vk_pool_lock);
|
||||||
|
|
||||||
int best_i = -1;
|
int best_i = -1;
|
||||||
|
@ -190,56 +269,72 @@ static void ggml_vk_pool_malloc(size_t size, vk_buffer* buf) {
|
||||||
if(worst_i != -1) {
|
if(worst_i != -1) {
|
||||||
//no buffer that fits our needs, resize largest one to save memory
|
//no buffer that fits our needs, resize largest one to save memory
|
||||||
vk_buffer& b = g_vk_buffer_pool[worst_i];
|
vk_buffer& b = g_vk_buffer_pool[worst_i];
|
||||||
b.size = 0;
|
ggml_vk_destroy_buffer(b);
|
||||||
vmaDestroyBuffer(vk_allocator, b.buffer, b.allocation);
|
|
||||||
}
|
}
|
||||||
buf->size = size;
|
|
||||||
|
|
||||||
vk::BufferCreateInfo buffer_create_info{
|
*buf = ggml_vk_create_buffer(size, alloc_flags, VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE, 0);
|
||||||
vk::BufferCreateFlags(),
|
|
||||||
size,
|
|
||||||
vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eTransferDst,
|
|
||||||
vk::SharingMode::eExclusive,
|
|
||||||
1,
|
|
||||||
&vk_compute_queue_family_index
|
|
||||||
};
|
|
||||||
|
|
||||||
VmaAllocationCreateInfo allocation_info = {};
|
|
||||||
allocation_info.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE;
|
|
||||||
allocation_info.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT;
|
|
||||||
|
|
||||||
vmaCreateBuffer(vk_allocator,
|
|
||||||
(VkBufferCreateInfo*)&buffer_create_info,
|
|
||||||
&allocation_info,
|
|
||||||
(VkBuffer*)&buf->buffer,
|
|
||||||
&buf->allocation,
|
|
||||||
&buf->info);
|
|
||||||
|
|
||||||
VkMemoryPropertyFlags mem_prop_flags;
|
|
||||||
vmaGetAllocationMemoryProperties(vk_allocator, buf->allocation, &mem_prop_flags);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_pool_free(vk_buffer* buffer) {
|
static void ggml_vk_pool_free(vk_buffer& buffer) {
|
||||||
scoped_spin_lock lock(g_vk_pool_lock);
|
scoped_spin_lock lock(g_vk_pool_lock);
|
||||||
|
|
||||||
for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
|
for (int i = 0; i < MAX_VK_BUFFERS; ++i) {
|
||||||
vk_buffer& b = g_vk_buffer_pool[i];
|
vk_buffer& b = g_vk_buffer_pool[i];
|
||||||
if (b.size == 0) {
|
if (b.size == 0) {
|
||||||
b = *buffer;
|
b = buffer;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fprintf(stderr, "WARNING: vk buffer pool full, increase MAX_VK_BUFFERS\n");
|
fprintf(stderr, "WARNING: vk buffer pool full, increase MAX_VK_BUFFERS\n");
|
||||||
buffer->size = 0;
|
ggml_vk_destroy_buffer(buffer);
|
||||||
vmaDestroyBuffer(vk_allocator, buffer->buffer, buffer->allocation);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static vk::CommandBuffer ggml_vk_cmd_buffer_create() {
|
void* ggml_vk_host_malloc(size_t size) {
|
||||||
vk::CommandPoolCreateInfo command_pool_create_info(vk::CommandPoolCreateFlags(), vk_compute_queue_family_index);
|
if (getenv("GGML_VK_NO_PINNED") != nullptr) {
|
||||||
vk::CommandPool command_pool = vk_device.createCommandPool(command_pool_create_info);
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
vk_buffer buf = ggml_vk_create_buffer(size, VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO_PREFER_HOST, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
|
||||||
|
|
||||||
|
VkMemoryPropertyFlags mem_prop_flags;
|
||||||
|
vmaGetAllocationMemoryProperties(vk_allocator, buf.allocation, &mem_prop_flags);
|
||||||
|
|
||||||
|
if(!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT)) {
|
||||||
|
fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
|
||||||
|
size/1024.0/1024.0);
|
||||||
|
buf.size = 0;
|
||||||
|
vmaDestroyBuffer(vk_allocator, buf.buffer, buf.allocation);
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("allocate %.2f MB of pinned memory\n", size/1024.0/1024.0);
|
||||||
|
|
||||||
|
vk_buf_list.push_back(std::make_tuple(buf.info.pMappedData, size, buf));
|
||||||
|
|
||||||
|
return buf.info.pMappedData;
|
||||||
|
}
|
||||||
|
|
||||||
|
void ggml_vk_host_free(void* ptr) {
|
||||||
|
vk_buffer* buf = nullptr;
|
||||||
|
for (size_t i = 0; i < vk_buf_list.size(); i++) {
|
||||||
|
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_buf_list[i]);
|
||||||
|
const uint8_t* endr = addr + std::get<1>(vk_buf_list[i]);
|
||||||
|
if (ptr >= addr && ptr < endr) {
|
||||||
|
buf = &std::get<2>(vk_buf_list[i]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (buf == nullptr) {
|
||||||
|
fprintf(stderr, "WARNING: to free pinned memory: memory not in map\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_vk_destroy_buffer(*buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
static vk::CommandBuffer ggml_vk_cmd_buffer_create(vk::CommandPool& pool) {
|
||||||
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
|
vk::CommandBufferAllocateInfo command_buffer_alloc_info(
|
||||||
command_pool,
|
pool,
|
||||||
vk::CommandBufferLevel::ePrimary,
|
vk::CommandBufferLevel::ePrimary,
|
||||||
1);
|
1);
|
||||||
const std::vector<vk::CommandBuffer> cmd_buffers = vk_device.allocateCommandBuffers(command_buffer_alloc_info);
|
const std::vector<vk::CommandBuffer> cmd_buffers = vk_device.allocateCommandBuffers(command_buffer_alloc_info);
|
||||||
|
@ -250,48 +345,40 @@ static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src
|
||||||
VkMemoryPropertyFlags mem_prop_flags;
|
VkMemoryPropertyFlags mem_prop_flags;
|
||||||
vmaGetAllocationMemoryProperties(vk_allocator, dst->allocation, &mem_prop_flags);
|
vmaGetAllocationMemoryProperties(vk_allocator, dst->allocation, &mem_prop_flags);
|
||||||
|
|
||||||
|
// Buffer is already mapped
|
||||||
if(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
|
if(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) {
|
||||||
memcpy(dst->info.pMappedData, src, size);
|
memcpy(dst->info.pMappedData, src, size);
|
||||||
if (!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
|
if (!(mem_prop_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
|
||||||
vmaFlushAllocation(vk_allocator, dst->allocation, 0, VK_WHOLE_SIZE);
|
vmaFlushAllocation(vk_allocator, dst->allocation, 0, VK_WHOLE_SIZE);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Allocation ended up in a non-mappable memory - need to transfer.
|
// Check if src is pinned memory
|
||||||
VkBufferCreateInfo staging_buf_create_info = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
|
vk_buffer* buf = nullptr;
|
||||||
staging_buf_create_info.size = size;
|
size_t buf_offset = 0;
|
||||||
staging_buf_create_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
|
for (size_t i = 0; i < vk_buf_list.size(); i++) {
|
||||||
|
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_buf_list[i]);
|
||||||
|
const uint8_t* endr = addr + std::get<1>(vk_buf_list[i]);
|
||||||
|
if (src >= addr && src < endr) {
|
||||||
|
buf = &std::get<2>(vk_buf_list[i]);
|
||||||
|
buf_offset = ((const uint8_t *)src) - addr;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
VmaAllocationCreateInfo staging_alloc_create_info = {};
|
if (buf != nullptr) {
|
||||||
staging_alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO;
|
// Memory is pinned, use as staging buffer
|
||||||
staging_alloc_create_info.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
|
|
||||||
VMA_ALLOCATION_CREATE_MAPPED_BIT;
|
|
||||||
|
|
||||||
VkBuffer staging_buf;
|
|
||||||
VmaAllocation staging_alloc;
|
|
||||||
VmaAllocationInfo staging_alloc_info;
|
|
||||||
vmaCreateBuffer(vk_allocator,
|
|
||||||
&staging_buf_create_info,
|
|
||||||
&staging_alloc_create_info,
|
|
||||||
&staging_buf,
|
|
||||||
&staging_alloc,
|
|
||||||
&staging_alloc_info);
|
|
||||||
|
|
||||||
// [Executed in runtime]:
|
|
||||||
memcpy(staging_alloc_info.pMappedData + offset, src, size);
|
|
||||||
vmaFlushAllocation(vk_allocator, staging_alloc, 0, VK_WHOLE_SIZE);
|
|
||||||
//vkCmdPipelineBarrier: VK_ACCESS_HOST_WRITE_BIT --> VK_ACCESS_TRANSFER_READ_BIT
|
|
||||||
VkBufferCopy buf_copy = {
|
VkBufferCopy buf_copy = {
|
||||||
0, // srcOffset
|
buf_offset, // srcOffset
|
||||||
0, // dstOffset,
|
offset, // dstOffset,
|
||||||
size}; // size
|
size}; // size
|
||||||
|
|
||||||
vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create();
|
vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer);
|
||||||
vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
||||||
cmd_buffer.begin(cmd_buffer_begin_info);
|
cmd_buffer.begin(cmd_buffer_begin_info);
|
||||||
vkCmdCopyBuffer(cmd_buffer, staging_buf, dst->buffer, 1, &buf_copy);
|
vkCmdCopyBuffer(cmd_buffer, buf->buffer, dst->buffer, 1, &buf_copy);
|
||||||
cmd_buffer.end();
|
cmd_buffer.end();
|
||||||
|
|
||||||
vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
|
vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0);
|
||||||
vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
|
vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
|
||||||
|
|
||||||
vk::SubmitInfo submit_info(0,
|
vk::SubmitInfo submit_info(0,
|
||||||
|
@ -303,13 +390,45 @@ static void ggml_vk_buffer_write(vk_buffer* dst, size_t offset, const void * src
|
||||||
vk_device.waitForFences({ fence },
|
vk_device.waitForFences({ fence },
|
||||||
true,
|
true,
|
||||||
uint64_t(-1));
|
uint64_t(-1));
|
||||||
vmaDestroyBuffer(vk_allocator, staging_buf, staging_alloc);
|
vk_device.destroyFence(fence);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Staging buffer required
|
||||||
|
vk_buffer staging_buf = ggml_vk_create_buffer(size, VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO, 0);
|
||||||
|
|
||||||
|
memcpy(((uint8_t *)staging_buf.info.pMappedData) + offset, src, size);
|
||||||
|
vmaFlushAllocation(vk_allocator, staging_buf.allocation, 0, VK_WHOLE_SIZE);
|
||||||
|
VkBufferCopy buf_copy = {
|
||||||
|
0, // srcOffset
|
||||||
|
offset, // dstOffset,
|
||||||
|
size}; // size
|
||||||
|
|
||||||
|
vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer);
|
||||||
|
vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
||||||
|
cmd_buffer.begin(cmd_buffer_begin_info);
|
||||||
|
vkCmdCopyBuffer(cmd_buffer, staging_buf.buffer, dst->buffer, 1, &buf_copy);
|
||||||
|
cmd_buffer.end();
|
||||||
|
|
||||||
|
vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0);
|
||||||
|
vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
|
||||||
|
|
||||||
|
vk::SubmitInfo submit_info(0,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
1,
|
||||||
|
&cmd_buffer);
|
||||||
|
queue.submit({ submit_info }, fence);
|
||||||
|
vk_device.waitForFences({ fence },
|
||||||
|
true,
|
||||||
|
uint64_t(-1));
|
||||||
|
|
||||||
|
vk_device.destroyFence(fence);
|
||||||
|
ggml_vk_destroy_buffer(staging_buf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size) {
|
static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_t size) {
|
||||||
vk::CommandBuffer cmd_buf = ggml_vk_cmd_buffer_create();
|
|
||||||
|
|
||||||
VkMemoryPropertyFlags mem_prop_flags;
|
VkMemoryPropertyFlags mem_prop_flags;
|
||||||
vmaGetAllocationMemoryProperties(vk_allocator, src->allocation, &mem_prop_flags);
|
vmaGetAllocationMemoryProperties(vk_allocator, src->allocation, &mem_prop_flags);
|
||||||
|
|
||||||
|
@ -319,40 +438,33 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
|
||||||
}
|
}
|
||||||
memcpy(dst, src->info.pMappedData, size);
|
memcpy(dst, src->info.pMappedData, size);
|
||||||
} else {
|
} else {
|
||||||
// Allocation ended up in a non-mappable memory - need to transfer.
|
// Check if dst is pinned memory
|
||||||
VkBufferCreateInfo staging_buf_create_info = { VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO };
|
vk_buffer* buf = nullptr;
|
||||||
staging_buf_create_info.size = size;
|
size_t buf_offset = 0;
|
||||||
staging_buf_create_info.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT;
|
for (size_t i = 0; i < vk_buf_list.size(); i++) {
|
||||||
|
const uint8_t* addr = (const uint8_t*) std::get<0>(vk_buf_list[i]);
|
||||||
|
const uint8_t* endr = addr + std::get<1>(vk_buf_list[i]);
|
||||||
|
if (dst >= addr && dst < endr) {
|
||||||
|
buf = &std::get<2>(vk_buf_list[i]);
|
||||||
|
buf_offset = ((const uint8_t *)dst) - addr;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
VmaAllocationCreateInfo staging_alloc_create_info = {};
|
if (buf != nullptr) {
|
||||||
staging_alloc_create_info.usage = VMA_MEMORY_USAGE_AUTO;
|
// Memory is pinned, use as staging buffer
|
||||||
staging_alloc_create_info.flags = VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT |
|
|
||||||
VMA_ALLOCATION_CREATE_MAPPED_BIT;
|
|
||||||
|
|
||||||
VkBuffer staging_buf;
|
|
||||||
VmaAllocation staging_alloc;
|
|
||||||
VmaAllocationInfo staging_alloc_info;
|
|
||||||
vmaCreateBuffer(vk_allocator,
|
|
||||||
&staging_buf_create_info,
|
|
||||||
&staging_alloc_create_info,
|
|
||||||
&staging_buf,
|
|
||||||
&staging_alloc,
|
|
||||||
&staging_alloc_info);
|
|
||||||
|
|
||||||
//vkCmdPipelineBarrier: VK_ACCESS_HOST_WRITE_BIT --> VK_ACCESS_TRANSFER_READ_BIT
|
|
||||||
VkBufferCopy buf_copy = {
|
VkBufferCopy buf_copy = {
|
||||||
offset, // srcOffset
|
offset, // srcOffset
|
||||||
0, // dstOffset,
|
buf_offset, // dstOffset,
|
||||||
size}; // size
|
size}; // size
|
||||||
vmaInvalidateAllocation(vk_allocator, staging_alloc, 0, VK_WHOLE_SIZE);
|
|
||||||
|
|
||||||
vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create();
|
vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer);
|
||||||
vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
||||||
cmd_buffer.begin(cmd_buffer_begin_info);
|
cmd_buffer.begin(cmd_buffer_begin_info);
|
||||||
vkCmdCopyBuffer(cmd_buffer, src->buffer, staging_buf, 1, &buf_copy);
|
vkCmdCopyBuffer(cmd_buffer, src->buffer, buf->buffer, 1, &buf_copy);
|
||||||
cmd_buffer.end();
|
cmd_buffer.end();
|
||||||
|
|
||||||
vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
|
vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0);
|
||||||
vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
|
vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
|
||||||
|
|
||||||
vk::SubmitInfo submit_info(0,
|
vk::SubmitInfo submit_info(0,
|
||||||
|
@ -364,8 +476,39 @@ static void ggml_vk_buffer_read(vk_buffer* src, size_t offset, void * dst, size_
|
||||||
vk_device.waitForFences({ fence },
|
vk_device.waitForFences({ fence },
|
||||||
true,
|
true,
|
||||||
uint64_t(-1));
|
uint64_t(-1));
|
||||||
memcpy(dst, staging_alloc_info.pMappedData, size);
|
vk_device.destroyFence(fence);
|
||||||
vmaDestroyBuffer(vk_allocator, staging_buf, staging_alloc);
|
return;
|
||||||
|
}
|
||||||
|
vk_buffer staging_buf = ggml_vk_create_buffer(size, VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT, VMA_MEMORY_USAGE_AUTO, 0);
|
||||||
|
|
||||||
|
VkBufferCopy buf_copy = {
|
||||||
|
offset, // srcOffset
|
||||||
|
0, // dstOffset,
|
||||||
|
size}; // size
|
||||||
|
vmaInvalidateAllocation(vk_allocator, staging_buf.allocation, 0, VK_WHOLE_SIZE);
|
||||||
|
|
||||||
|
vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_transfer);
|
||||||
|
vk::CommandBufferBeginInfo cmd_buffer_begin_info(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
||||||
|
cmd_buffer.begin(cmd_buffer_begin_info);
|
||||||
|
vkCmdCopyBuffer(cmd_buffer, src->buffer, staging_buf.buffer, 1, &buf_copy);
|
||||||
|
cmd_buffer.end();
|
||||||
|
|
||||||
|
vk::Queue queue = vk_device.getQueue(vk_transfer_queue_family_index, 0);
|
||||||
|
vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
|
||||||
|
|
||||||
|
vk::SubmitInfo submit_info(0,
|
||||||
|
nullptr,
|
||||||
|
nullptr,
|
||||||
|
1,
|
||||||
|
&cmd_buffer);
|
||||||
|
queue.submit({ submit_info }, fence);
|
||||||
|
vk_device.waitForFences({ fence },
|
||||||
|
true,
|
||||||
|
uint64_t(-1));
|
||||||
|
memcpy(dst, staging_buf.info.pMappedData, size);
|
||||||
|
|
||||||
|
vk_device.destroyFence(fence);
|
||||||
|
ggml_vk_destroy_buffer(staging_buf);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -424,10 +567,10 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
if (src0->backend == GGML_BACKEND_GPU) {
|
if (src0->backend == GGML_BACKEND_GPU) {
|
||||||
d_X = *(vk_buffer*) src0->data;
|
d_X = *(vk_buffer*) src0->data;
|
||||||
} else {
|
} else {
|
||||||
ggml_vk_pool_malloc(ggml_type_size(src0->type) * x_ne, &d_X);
|
ggml_vk_pool_malloc(ggml_type_size(src0->type) * x_ne, &d_X, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
|
||||||
}
|
}
|
||||||
ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y);
|
ggml_vk_pool_malloc(sizeof(float) * y_ne, &d_Y, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_SEQUENTIAL_WRITE_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
|
||||||
ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D);
|
ggml_vk_pool_malloc(sizeof(float) * d_ne, &d_D, VMA_ALLOCATION_CREATE_DEDICATED_MEMORY_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_RANDOM_BIT | VMA_ALLOCATION_CREATE_HOST_ACCESS_ALLOW_TRANSFER_INSTEAD_BIT | VMA_ALLOCATION_CREATE_MAPPED_BIT);
|
||||||
|
|
||||||
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3);
|
vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, 3);
|
||||||
vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
|
vk::DescriptorPoolCreateInfo descriptor_pool_create_info(vk::DescriptorPoolCreateFlags(), 1, descriptor_pool_size);
|
||||||
|
@ -450,7 +593,8 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
std::array<int, 6> push_constants = { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 };
|
std::array<int, 6> push_constants = { (int)ne01, (int)ne11, (int)ne10, (int)ne00, (int)ne10, (int)ne01 };
|
||||||
assert( ( sizeof( push_constants ) <= vk_physical_device.getProperties().limits.maxPushConstantsSize ) && "Too many push constants" );
|
assert( ( sizeof( push_constants ) <= vk_physical_device.getProperties().limits.maxPushConstantsSize ) && "Too many push constants" );
|
||||||
|
|
||||||
vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create();
|
vk::CommandBuffer cmd_buffer = ggml_vk_cmd_buffer_create(vk_command_pool_compute);
|
||||||
|
vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
|
@ -476,7 +620,6 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
cmd_buffer.end();
|
cmd_buffer.end();
|
||||||
|
|
||||||
vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
|
vk::Queue queue = vk_device.getQueue(vk_compute_queue_family_index, 0);
|
||||||
vk::Fence fence = vk_device.createFence(vk::FenceCreateInfo());
|
|
||||||
|
|
||||||
vk::SubmitInfo submit_info(0,
|
vk::SubmitInfo submit_info(0,
|
||||||
nullptr,
|
nullptr,
|
||||||
|
@ -496,7 +639,7 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||||
ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne);
|
ggml_vk_buffer_read(&d_D, 0, d, sizeof(float) * d_ne);
|
||||||
|
|
||||||
#if 0
|
#ifdef VK_CHK_KERNEL
|
||||||
const float * x = (float *) ((char *) src0->data);
|
const float * x = (float *) ((char *) src0->data);
|
||||||
const float * y = (float *) ((char *) src1->data);
|
const float * y = (float *) ((char *) src1->data);
|
||||||
float * d_chk = (float *) malloc(sizeof(float) * d_ne);
|
float * d_chk = (float *) malloc(sizeof(float) * d_ne);
|
||||||
|
@ -519,11 +662,13 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vk_device.destroyFence(fence);
|
||||||
|
|
||||||
if (src0->backend != GGML_BACKEND_GPU) {
|
if (src0->backend != GGML_BACKEND_GPU) {
|
||||||
ggml_vk_pool_free(&d_X);
|
ggml_vk_pool_free(d_X);
|
||||||
}
|
}
|
||||||
ggml_vk_pool_free(&d_Y);
|
ggml_vk_pool_free(d_Y);
|
||||||
ggml_vk_pool_free(&d_D);
|
ggml_vk_pool_free(d_D);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_vk_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
|
|
46
llama-util.h
46
llama-util.h
|
@ -497,6 +497,52 @@ struct llama_ctx_buffer {
|
||||||
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
||||||
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
||||||
};
|
};
|
||||||
|
#elif defined(GGML_USE_VULKAN)
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
struct llama_ctx_buffer {
|
||||||
|
uint8_t * addr = NULL;
|
||||||
|
bool is_vk;
|
||||||
|
size_t size = 0;
|
||||||
|
|
||||||
|
llama_ctx_buffer() = default;
|
||||||
|
|
||||||
|
void resize(size_t size) {
|
||||||
|
free();
|
||||||
|
|
||||||
|
addr = (uint8_t *) ggml_vk_host_malloc(size);
|
||||||
|
if (addr) {
|
||||||
|
is_vk = true;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// fall back to pageable memory
|
||||||
|
addr = new uint8_t[size];
|
||||||
|
is_vk = false;
|
||||||
|
}
|
||||||
|
this->size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
void free() {
|
||||||
|
if (addr) {
|
||||||
|
if (is_vk) {
|
||||||
|
ggml_vk_host_free(addr);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
delete[] addr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
addr = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
~llama_ctx_buffer() {
|
||||||
|
free();
|
||||||
|
}
|
||||||
|
|
||||||
|
// disable copy and move
|
||||||
|
llama_ctx_buffer(const llama_ctx_buffer&) = delete;
|
||||||
|
llama_ctx_buffer(llama_ctx_buffer&&) = delete;
|
||||||
|
llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
|
||||||
|
llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
|
||||||
|
};
|
||||||
#else
|
#else
|
||||||
typedef llama_buffer llama_ctx_buffer;
|
typedef llama_buffer llama_ctx_buffer;
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue