Minor fixes
This commit is contained in:
parent
dad1cdb1ef
commit
e2962e1262
2 changed files with 5 additions and 5 deletions
|
@ -585,7 +585,7 @@ void main() {
|
||||||
const int is = 0;
|
const int is = 0;
|
||||||
#else
|
#else
|
||||||
const int l0 = 4 * v_in; // 0, 4, 8, ..., 28
|
const int l0 = 4 * v_in; // 0, 4, 8, ..., 28
|
||||||
const int is = in / 4;
|
const int is = v_in / 4;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const int ql_offset = 64*v_im + l0;
|
const int ql_offset = 64*v_im + l0;
|
||||||
|
@ -611,7 +611,7 @@ void main() {
|
||||||
+ FLOAT_TYPE(y[y_idx +112]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + 48] >> 4) | ((x[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32);
|
+ FLOAT_TYPE(y[y_idx +112]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + 48] >> 4) | ((x[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32);
|
||||||
tmp[16 * ix + tid] += sum;
|
tmp[16 * ix + tid] += sum;
|
||||||
#else
|
#else
|
||||||
FLOAT_TYPE sum = 0;
|
FLOAT_TYPE sum = FLOAT_TYPE(0.0);
|
||||||
for (int l = 0; l < 4; ++l) {
|
for (int l = 0; l < 4; ++l) {
|
||||||
sum += FLOAT_TYPE(y[y_idx + l+ 0]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((x[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32)
|
sum += FLOAT_TYPE(y[y_idx + l+ 0]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((x[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32)
|
||||||
+ FLOAT_TYPE(y[y_idx + l+32]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((x[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32)
|
+ FLOAT_TYPE(y[y_idx + l+32]) * FLOAT_TYPE(x[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((x[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((x[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32)
|
||||||
|
|
|
@ -7,9 +7,9 @@
|
||||||
|
|
||||||
#ifdef VK_PROFILE
|
#ifdef VK_PROFILE
|
||||||
#define PROFILE(name, block) do { \
|
#define PROFILE(name, block) do { \
|
||||||
auto begin = std::chrono::high_resolution_clock::now(); \
|
auto begin = std::chrono::steady_clock::now(); \
|
||||||
block \
|
block \
|
||||||
auto end = std::chrono::high_resolution_clock::now(); \
|
auto end = std::chrono::steady_clock::now(); \
|
||||||
double time_taken = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; \
|
double time_taken = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0; \
|
||||||
printf("%s: %lf ms\n", name, time_taken); \
|
printf("%s: %lf ms\n", name, time_taken); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
@ -914,7 +914,7 @@ std::cerr << "ggml_vulkan: Validation layers enabled" << std::endl;
|
||||||
|
|
||||||
// Try to find a non-graphics compute queue and transfer-focused queues
|
// Try to find a non-graphics compute queue and transfer-focused queues
|
||||||
const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
|
const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
|
||||||
const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics | vk::QueueFlagBits::eVideoDecodeKHR | vk::QueueFlagBits::eProtected | vk::QueueFlagBits::eOpticalFlowNV, compute_queue_family_index, 2);
|
const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 2);
|
||||||
|
|
||||||
uint32_t transfer_queue_count = VK_TRANSFER_QUEUE_COUNT;
|
uint32_t transfer_queue_count = VK_TRANSFER_QUEUE_COUNT;
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue