Clean up unused functions
Add repeat op
parent 3de5ba4946
commit 0230981649
1 changed file with 88 additions and 178 deletions
ggml-vulkan.cpp (266 changes)
@@ -1404,115 +1404,6 @@ static inline size_t ggml_vk_align_size(size_t width, size_t align) {
     return CEIL_DIV(width, align) * align;
 }
 
-static vk_sequence ggml_vk_buffer_write_2d_async_zeropad(vk_buffer* dst, size_t offset, const void * src, size_t spitch, size_t width, size_t height, size_t align, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr) {
-#ifdef VK_DEBUG
-    std::cerr << "ggml_vk_buffer_write_2d_async_zeropad(" << offset << ", " << spitch << ", " << width << ", " << height << ", " << align << ")" << std::endl;
-#endif
-    // Outdated
-    GGML_ASSERT(false);
-    // Buffer is already mapped
-    if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        std::cerr << "ggml_vulkan: buffer_write_2d_async_zeropad dst buffer is host_visible. Use synchronous write." << std::endl;
-        GGML_ASSERT(false);
-    }
-    // Check if src is pinned memory
-    vk_buffer* buf = nullptr;
-    size_t buf_offset = 0;
-    for (size_t i = 0; i < vk_pinned_memory.size(); i++) {
-        const uint8_t* addr = (const uint8_t*) std::get<0>(vk_pinned_memory[i]);
-        const uint8_t* endr = addr + std::get<1>(vk_pinned_memory[i]);
-        if (src >= addr && src < endr) {
-            buf = &std::get<2>(vk_pinned_memory[i]);
-            buf_offset = ((const uint8_t *)src) - addr;
-            break;
-        }
-    }
-
-    // Align slices to the value of align
-    const uint32_t padded_width = ggml_vk_align_size(width, align);
-
-    bool reuse_submission = false;
-    vk_submission submission;
-    if (s == nullptr) {
-        submission = ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores));
-        s = &submission;
-        reuse_submission = true;
-    }
-
-    if (buf != nullptr) {
-        std::vector<vk::BufferCopy> slices(1);
-        if (width == padded_width && width == spitch) {
-            // Only do single write if no padding happens
-            slices[0].srcOffset = buf_offset;
-            slices[0].dstOffset = offset;
-            slices[0].size = width * height;
-        } else {
-            slices.resize(height);
-            for (size_t i = 0; i < height; i++) {
-                slices[i].srcOffset = buf_offset + i * spitch;
-                slices[i].dstOffset = offset + i * padded_width;
-                slices[i].size = width;
-            }
-        }
-
-        if (reuse_submission) {
-            s->buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
-        }
-        ggml_vk_sync_buffers(s->buffer, { ggml_vk_subbuffer(*dst) }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eMemoryWrite, false);
-        if (padded_width > width) {
-            s->buffer.fillBuffer(dst->buffer, 0, VK_WHOLE_SIZE, 0);
-        }
-        s->buffer.pipelineBarrier(
-            q.stage_flags,
-            q.stage_flags,
-            {},
-            {},
-            {
-                { vk::AccessFlagBits::eMemoryWrite, vk::AccessFlagBits::eMemoryWrite, VK_QUEUE_FAMILY_IGNORED, VK_QUEUE_FAMILY_IGNORED, dst->buffer, 0, VK_WHOLE_SIZE }
-            },
-            {}
-        );
-        s->buffer.copyBuffer(buf->buffer, dst->buffer, slices);
-        if (reuse_submission) {
-            s->buffer.end();
-        }
-        return { *s };
-    }
-
-    // Staging buffer required, malloc because of async transfer
-    if (dst->sb_write == nullptr) {
-        dst->sb_write = new vk_buffer;
-        *dst->sb_write = ggml_vk_create_buffer(dst->size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
-    }
-
-    vk::BufferCopy buf_copy = {
-        0,
-        offset,
-        padded_width * height};
-
-    if (reuse_submission) {
-        s->buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
-    }
-    ggml_vk_sync_buffers(s->buffer, { ggml_vk_subbuffer(*dst) }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eTransferWrite, false);
-    s->buffer.copyBuffer(dst->sb_write->buffer, dst->buffer, { buf_copy });
-    if (reuse_submission) {
-        s->buffer.end();
-    }
-
-    const size_t zeropad = padded_width - width;
-
-    if (width == padded_width && width == spitch) {
-        memcpy(dst->sb_write->ptr, src, width * height);
-    } else {
-        for (size_t i = 0; i < height; i++) {
-            memcpy((uint8_t *)dst->sb_write->ptr + i * padded_width, (const uint8_t *) src + i * spitch, width);
-            memset((uint8_t *)dst->sb_write->ptr + i * padded_width + width, 0, zeropad);
-        }
-    }
-
-    return { *s };
-}
-
 static vk_sequence ggml_vk_buffer_write_async(vk_buffer* dst, size_t offset, const void * src, size_t size, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores, vk_submission* s = nullptr, std::vector<vk_staging_memcpy>* pre_staging = nullptr) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_buffer_write_async(" << size << ")" << std::endl;
@@ -1678,73 +1569,6 @@ static vk_sequence ggml_vk_h2d_tensor_2d(vk_buffer* dst, size_t offset, const st
     }
 }
 
-static vk_sequence ggml_vk_h2d_tensor_2d_f32_to_f16(vk_buffer* dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, vk_queue& q, std::vector<vk::Semaphore> wait_semaphores, std::vector<vk::Semaphore> signal_semaphores) {
-#ifdef VK_DEBUG
-    std::cerr << "ggml_vk_h2d_tensor_2d()" << std::endl;
-#endif
-    GGML_ASSERT(src->type == GGML_TYPE_F32);
-
-    const uint64_t ne0 = src->ne[0];
-    const uint64_t ne1 = src->ne[1];
-    const uint64_t nb0 = src->nb[0];
-    const uint64_t nb1 = src->nb[1];
-    const uint64_t nb2 = src->nb[2];
-    const uint64_t nb3 = src->nb[3];
-    const enum ggml_type type = src->type;
-    const size_t ts = ggml_type_size(type);
-    const size_t bs = ggml_blck_size(type);
-    const size_t row_length = ts*ne0/bs;
-
-    const uint32_t copy_size = sizeof(ggml_fp16_t) * ne0 * ne1;
-
-    if (dst->sb_write == nullptr) {
-        dst->sb_write = new vk_buffer;
-        *dst->sb_write = ggml_vk_create_buffer(dst->size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
-    }
-
-    ggml_fp16_t * tmp = (ggml_fp16_t *) ((uint8_t *) dst->sb_write->ptr + offset);
-    const uint8_t * x = (const uint8_t *) src->data + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == row_length) {
-        ggml_fp32_to_fp16_row((const float *) x, tmp, ne0*ne1);
-
-        vk_submission s = ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores));
-
-        vk::BufferCopy buf_copy = {
-            offset,
-            offset,
-            copy_size,
-        };
-
-        s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
-        ggml_vk_sync_buffers(s.buffer, { { *dst, (uint32_t)offset, copy_size } }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eTransferWrite, false);
-        s.buffer.copyBuffer(dst->sb_write->buffer, dst->buffer, { buf_copy });
-        s.buffer.end();
-
-        return { s };
-    }
-    if (nb0 == ts) {
-        for (uint64_t i1 = 0; i1 < ne1; i1++) {
-            ggml_fp32_to_fp16_row((const float *) (x + i1*nb1), tmp + i1*ne0, ne0);
-        }
-
-        vk_submission s = ggml_vk_create_submission(q, std::move(wait_semaphores), std::move(signal_semaphores));
-
-        vk::BufferCopy buf_copy = {
-            offset,
-            offset,
-            copy_size,
-        };
-
-        s.buffer.begin({ vk::CommandBufferUsageFlagBits::eOneTimeSubmit });
-        ggml_vk_sync_buffers(s.buffer, { { *dst, (uint32_t)offset, copy_size } }, q, vk::AccessFlagBits::eMemoryRead, vk::AccessFlagBits::eTransferWrite, false);
-        s.buffer.copyBuffer(dst->sb_write->buffer, dst->buffer, { buf_copy });
-        s.buffer.end();
-
-        return { s };
-    }
-    GGML_ASSERT(false);
-}
-
 static int ggml_vk_guess_split_k(int m, int n, int k) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_guess_split_k()";
@@ -2336,6 +2160,76 @@ static void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_t
     }
 }
 
+static void ggml_vk_op_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // guaranteed to be an integer due to the check in ggml_can_repeat
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t ne3 = dst->ne[3];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const size_t nb0 = dst->nb[0];
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    const size_t nb00 = src0->nb[0];
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+    const size_t nb03 = src0->nb[3];
+
+    const int nr0 = (int)(ne0/ne00);
+    const int nr1 = (int)(ne1/ne01);
+    const int nr2 = (int)(ne2/ne02);
+    const int nr3 = (int)(ne3/ne03);
+
+    // TODO: support for transposed / permuted tensors
+    GGML_ASSERT(nb0 == sizeof(float));
+    GGML_ASSERT(nb00 == sizeof(float));
+    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
+    ggml_vk_tensor_extra_gpu * extra = (ggml_vk_tensor_extra_gpu *) dst->extra;
+
+    const vk_buffer* src0_buf = (vk_buffer *) src0->data;
+    vk_buffer* dst_buf = (vk_buffer *) dst->data;
+
+    std::vector<VkBufferCopy> copies;
+
+    // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors
+    for (int i3 = 0; i3 < nr3; i3++) {
+        for (int k3 = 0; k3 < ne03; k3++) {
+            for (int i2 = 0; i2 < nr2; i2++) {
+                for (int k2 = 0; k2 < ne02; k2++) {
+                    for (int i1 = 0; i1 < nr1; i1++) {
+                        for (int k1 = 0; k1 < ne01; k1++) {
+                            for (int i0 = 0; i0 < nr0; i0++) {
+                                copies.push_back({
+                                    (i3*ne03 + k3)*nb3  + (i2*ne02 + k2)*nb2  + (i1*ne01 + k1)*nb1  + (i0*ne00)*nb0,
+                                    (          k3)*nb03 + (          k2)*nb02 + (          k1)*nb01,
+                                    ne00*nb0,
+                                });
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    vk_submission s = ggml_vk_begin_submission(vk_device.transfer_queues[0]);
+    vkCmdCopyBuffer(s.buffer, src0_buf->buffer, dst_buf->buffer, copies.size(), copies.data());
+    ggml_vk_end_submission(s, {}, {});
+    extra->out_seqs.push_back({ s });
+
+    (void) src1;
+}
+
+
 static vk_pipeline* ggml_vk_op_get_pipeline(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) {
     switch (op) {
     case GGML_OP_ADD:
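The new ggml_vk_op_repeat above implements GGML_OP_REPEAT with plain buffer-to-buffer copies: the nested loops emit one copy per contiguous source row, placed at every repeat position in the destination. As a rough illustration of that offset arithmetic (an editorial sketch, not part of the commit), the standalone program below mirrors the 2D slice of the loop structure for a hypothetical 4x2 f32 tensor tiled three times along dimension 1; the shapes, strides, and output format are invented for the example.

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical shapes: src0 is 4x2 f32, dst is 4x6 f32 (src0 tiled 3x along dim 1).
    const int64_t ne00 = 4, ne01 = 2;                           // src0 extents
    const int64_t ne0  = 4, ne1  = 6;                           // dst extents
    const uint64_t nb00 = sizeof(float), nb01 = ne00 * nb00;    // src0 strides (contiguous)
    const uint64_t nb0  = sizeof(float), nb1  = ne0  * nb0;     // dst strides (contiguous)
    const int nr0 = (int)(ne0 / ne00);                          // repeats along dim 0 (= 1 here)
    const int nr1 = (int)(ne1 / ne01);                          // repeats along dim 1 (= 3 here)

    // Same enumeration idea as ggml_vk_op_repeat: one region of ne00*nb00 bytes
    // per (repeat position, source row) pair.
    for (int i1 = 0; i1 < nr1; i1++) {
        for (int k1 = 0; k1 < ne01; k1++) {
            for (int i0 = 0; i0 < nr0; i0++) {
                const uint64_t src_off = k1 * nb01;                                  // one source row
                const uint64_t dst_off = (i1 * ne01 + k1) * nb1 + (i0 * ne00) * nb0; // its place in dst
                printf("copy %llu bytes: src+%llu -> dst+%llu\n",
                       (unsigned long long)(ne00 * nb00),
                       (unsigned long long)src_off,
                       (unsigned long long)dst_off);
            }
        }
    }
    return 0;
}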
@@ -2360,6 +2254,15 @@ static vk_pipeline* ggml_vk_op_get_pipeline(const ggml_tensor * src0, const ggml
     }
 }
 
+static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) {
+    switch(op) {
+    case GGML_OP_REPEAT:
+        return ggml_vk_op_repeat;
+    default:
+        return nullptr;
+    }
+}
+
 static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op, float scale=1.0f) {
 #ifdef VK_DEBUG
     std::cerr << "ggml_vk_op_f32((type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3];
@@ -2387,9 +2290,16 @@ static void ggml_vk_op_f32(const ggml_tensor * src0, const ggml_tensor * src1, g
     GGML_ASSERT(dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] == ne0);
     GGML_ASSERT(nb10 == sizeof(float));
 
-    vk_pipeline* pipeline = ggml_vk_op_get_pipeline(src0, src1, dst, op);
+    vk_pipeline * pipeline = ggml_vk_op_get_pipeline(src0, src1, dst, op);
+    ggml_vk_func_t op_func;
 
-    GGML_ASSERT(pipeline != nullptr);
+    if (pipeline == nullptr) {
+        op_func = ggml_vk_op_get_func(op);
+        GGML_ASSERT(op_func != nullptr);
+
+        op_func(src0, src1, dst);
+        return;
+    }
 
     const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU;
     const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU;
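The last hunk reworks ggml_vk_op_f32 so that ops without a compute pipeline fall back to a host-side handler looked up by ggml_vk_op_get_func, which is how GGML_OP_REPEAT reaches ggml_vk_op_repeat. Reduced to a standalone toy (an editorial sketch, not from the commit; every name below is invented), the dispatch pattern looks like this:

#include <cassert>
#include <cstdio>

enum toy_op { TOY_OP_ADD, TOY_OP_REPEAT };
typedef void (*toy_func_t)(int);

static void toy_repeat(int x) { printf("repeat handled on the fallback path (%d)\n", x); }

// Returns a pipeline "name" if the op has a compute shader, nullptr otherwise.
static const char * toy_get_pipeline(toy_op op) {
    return op == TOY_OP_ADD ? "add_f32" : nullptr;
}

// Fallback table: ops handled without a pipeline, e.g. by plain buffer copies.
static toy_func_t toy_get_func(toy_op op) {
    return op == TOY_OP_REPEAT ? toy_repeat : nullptr;
}

static void toy_dispatch(toy_op op, int x) {
    const char * pipeline = toy_get_pipeline(op);
    if (pipeline == nullptr) {
        toy_func_t f = toy_get_func(op);
        assert(f != nullptr);
        f(x);
        return;
    }
    printf("dispatching pipeline %s (%d)\n", pipeline, x);
}

int main() {
    toy_dispatch(TOY_OP_ADD, 1);
    toy_dispatch(TOY_OP_REPEAT, 2);
    return 0;
}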