Merge branch 'ggerganov:master' into server-chat-templates
This commit is contained in:
commit
7a24341e1b
10 changed files with 182 additions and 99 deletions
|
@ -1,6 +1,6 @@
|
|||
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8
|
||||
|
||||
FROM cosdt/cann:$ASCEND_VERSION AS build
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS build
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
|
@ -26,7 +26,7 @@ RUN echo "Building with static libs" && \
|
|||
cmake --build build --config Release --target llama-cli
|
||||
|
||||
# TODO: use image with NNRT
|
||||
FROM cosdt/cann:$ASCEND_VERSION AS runtime
|
||||
FROM ascendai/cann:$ASCEND_VERSION AS runtime
|
||||
COPY --from=build /app/build/bin/llama-cli /llama-cli
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04
|
||||
|
||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
|
||||
|
||||
|
|
21
.github/workflows/build.yml
vendored
21
.github/workflows/build.yml
vendored
|
@ -414,6 +414,27 @@ jobs:
|
|||
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
|
||||
cmake --build build2 --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-cmake-musa:
|
||||
runs-on: ubuntu-22.04
|
||||
container: mthreads/musa:rc3.1.0-devel-ubuntu22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Dependencies
|
||||
id: depends
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y build-essential git cmake libcurl4-openssl-dev
|
||||
|
||||
- name: Build with native CMake MUSA support
|
||||
id: cmake_build
|
||||
run: |
|
||||
cmake -B build -S . -DGGML_MUSA=ON
|
||||
cmake --build build --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-cmake-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
|
|
|
@ -375,7 +375,7 @@ cmake --build build --config release
|
|||
|
||||
You can test with:
|
||||
|
||||
`./build/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
|
||||
`./build/bin/llama-cli -m PATH_TO_MODEL -p "Building a website can be done in 10 steps:" -ngl 32`
|
||||
|
||||
If the fllowing info is output on screen, you are using `llama.cpp by CANN backend`:
|
||||
```bash
|
||||
|
|
|
@ -12,7 +12,7 @@
|
|||
.markdown {
|
||||
h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; }
|
||||
pre {
|
||||
@apply whitespace-pre-wrap my-4 rounded-lg p-2;
|
||||
@apply whitespace-pre-wrap rounded-lg p-2;
|
||||
border: 1px solid currentColor;
|
||||
}
|
||||
/* TODO: fix markdown table */
|
||||
|
@ -25,8 +25,11 @@
|
|||
.bg-base-200 {background-color: var(--fallback-b2,oklch(var(--b2)/1))}
|
||||
.bg-base-300 {background-color: var(--fallback-b3,oklch(var(--b3)/1))}
|
||||
.text-base-content {color: var(--fallback-bc,oklch(var(--bc)/1))}
|
||||
.show-on-hover {
|
||||
@apply opacity-0 group-hover:opacity-100;
|
||||
}
|
||||
.btn-mini {
|
||||
@apply cursor-pointer opacity-0 group-hover:opacity-100 hover:shadow-md;
|
||||
@apply cursor-pointer hover:shadow-md;
|
||||
}
|
||||
.chat-screen { max-width: 900px; }
|
||||
/* because the default bubble color is quite dark, we will make a custom one using bg-base-300 */
|
||||
|
@ -152,14 +155,14 @@
|
|||
<!-- actions for each message -->
|
||||
<div :class="{'text-right': msg.role === 'user'}" class="mx-4 mt-2 mb-2">
|
||||
<!-- user message -->
|
||||
<button v-if="msg.role === 'user'" class="badge btn-mini" @click="editingMsg = msg" :disabled="isGenerating">
|
||||
<button v-if="msg.role === 'user'" class="badge btn-minishow-on-hover " @click="editingMsg = msg" :disabled="isGenerating">
|
||||
✍️ Edit
|
||||
</button>
|
||||
<!-- assistant message -->
|
||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
|
||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="regenerateMsg(msg)" :disabled="isGenerating">
|
||||
🔄 Regenerate
|
||||
</button>
|
||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
|
||||
<button v-if="msg.role === 'assistant'" class="badge btn-mini show-on-hover mr-2" @click="copyMsg(msg)" :disabled="isGenerating">
|
||||
📋 Copy
|
||||
</button>
|
||||
</div>
|
||||
|
@ -196,12 +199,13 @@
|
|||
<h3 class="text-lg font-bold mb-6">Settings</h3>
|
||||
<div class="h-[calc(90vh-12rem)] overflow-y-auto">
|
||||
<p class="opacity-40 mb-6">Settings below are saved in browser's localStorage</p>
|
||||
<settings-modal-short-input :config-key="'apiKey'" :config-default="configDefault" :config-info="configInfo" v-model="config.apiKey"></settings-modal-short-input>
|
||||
<label class="form-control mb-2">
|
||||
<div class="label">System Message</div>
|
||||
<textarea class="textarea textarea-bordered h-24" :placeholder="'Default: ' + configDefault.systemMessage" v-model="config.systemMessage"></textarea>
|
||||
</label>
|
||||
<template v-for="configKey in ['temperature', 'top_k', 'top_p', 'min_p', 'max_tokens']">
|
||||
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
|
||||
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
|
||||
</template>
|
||||
<!-- TODO: add more sampling-related configs, please regroup them into different "collapse" sections -->
|
||||
<!-- Section: Other sampler settings -->
|
||||
|
@ -209,7 +213,7 @@
|
|||
<summary class="collapse-title font-bold">Other sampler settings</summary>
|
||||
<div class="collapse-content">
|
||||
<template v-for="configKey in ['dynatemp_range', 'dynatemp_exponent', 'typical_p', 'xtc_probability', 'xtc_threshold']">
|
||||
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
|
||||
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
|
||||
</template>
|
||||
</div>
|
||||
</details>
|
||||
|
@ -218,7 +222,7 @@
|
|||
<summary class="collapse-title font-bold">Penalties settings</summary>
|
||||
<div class="collapse-content">
|
||||
<template v-for="configKey in ['repeat_last_n', 'repeat_penalty', 'presence_penalty', 'frequency_penalty', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_penalty_last_n']">
|
||||
<settings-modal-numeric-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
|
||||
<settings-modal-short-input :config-key="configKey" :config-default="configDefault" :config-info="configInfo" v-model="config[configKey]" />
|
||||
</template>
|
||||
</div>
|
||||
</details>
|
||||
|
@ -254,7 +258,7 @@
|
|||
</div>
|
||||
|
||||
<!-- Template to be used by settings modal -->
|
||||
<template id="settings-modal-numeric-input">
|
||||
<template id="settings-modal-short-input">
|
||||
<label class="input input-bordered join-item grow flex items-center gap-2 mb-2">
|
||||
<!-- Show help message on hovering on the input label -->
|
||||
<div class="dropdown dropdown-hover">
|
||||
|
@ -273,9 +277,13 @@
|
|||
import { createApp, defineComponent, shallowRef, computed, h } from './deps_vue.esm-browser.js';
|
||||
import { llama } from './completion.js';
|
||||
|
||||
// utility functions
|
||||
const isString = (x) => !!x.toLowerCase;
|
||||
const isNumeric = (n) => !isString(n) && !isNaN(n);
|
||||
const escapeAttr = (str) => str.replace(/>/g, '>').replace(/"/g, '"');
|
||||
const copyStr = (str) => navigator.clipboard.writeText(str);
|
||||
|
||||
// constants
|
||||
const BASE_URL = localStorage.getItem('base') // for debugging
|
||||
|| (new URL('.', document.baseURI).href).toString(); // for production
|
||||
const CONFIG_DEFAULT = {
|
||||
|
@ -305,7 +313,7 @@
|
|||
custom: '', // custom json-stringified object
|
||||
};
|
||||
const CONFIG_INFO = {
|
||||
apiKey: '',
|
||||
apiKey: 'Set the API Key if you are using --api-key option for the server.',
|
||||
systemMessage: 'The starting message that defines how model should behave.',
|
||||
samplers: 'The order at which samplers are applied, in simplified way. Default is "dkypmxt": dry->top_k->typ_p->top_p->min_p->xtc->temperature',
|
||||
temperature: 'Controls the randomness of the generated text by affecting the probability distribution of the output tokens. Higher = more random, lower = more focused.',
|
||||
|
@ -336,19 +344,28 @@
|
|||
// markdown support
|
||||
const VueMarkdown = defineComponent(
|
||||
(props) => {
|
||||
const md = shallowRef(new markdownit(props.options ?? { breaks: true }));
|
||||
for (const plugin of props.plugins ?? []) {
|
||||
md.value.use(plugin);
|
||||
}
|
||||
const md = shallowRef(new markdownit({ breaks: true }));
|
||||
const origFenchRenderer = md.value.renderer.rules.fence;
|
||||
md.value.renderer.rules.fence = (tokens, idx, ...args) => {
|
||||
const content = tokens[idx].content;
|
||||
const origRendered = origFenchRenderer(tokens, idx, ...args);
|
||||
return `<div class="relative my-4">
|
||||
<div class="text-right sticky top-4 mb-2 mr-2 h-0">
|
||||
<button class="badge btn-mini" onclick="copyStr(${escapeAttr(JSON.stringify(content))})">📋 Copy</button>
|
||||
</div>
|
||||
${origRendered}
|
||||
</div>`;
|
||||
};
|
||||
window.copyStr = copyStr;
|
||||
const content = computed(() => md.value.render(props.source));
|
||||
return () => h("div", { innerHTML: content.value });
|
||||
},
|
||||
{ props: ["source", "options", "plugins"] }
|
||||
{ props: ["source"] }
|
||||
);
|
||||
|
||||
// inout field to be used by settings modal
|
||||
const SettingsModalNumericInput = defineComponent({
|
||||
template: document.getElementById('settings-modal-numeric-input').innerHTML,
|
||||
const SettingsModalShortInput = defineComponent({
|
||||
template: document.getElementById('settings-modal-short-input').innerHTML,
|
||||
props: ['configKey', 'configDefault', 'configInfo', 'modelValue'],
|
||||
});
|
||||
|
||||
|
@ -401,7 +418,11 @@
|
|||
if (!conv) return;
|
||||
const msg = conv.messages.pop();
|
||||
conv.lastModified = Date.now();
|
||||
localStorage.setItem(convId, JSON.stringify(conv));
|
||||
if (conv.messages.length === 0) {
|
||||
StorageUtils.remove(convId);
|
||||
} else {
|
||||
localStorage.setItem(convId, JSON.stringify(conv));
|
||||
}
|
||||
return msg;
|
||||
},
|
||||
|
||||
|
@ -442,7 +463,7 @@
|
|||
const mainApp = createApp({
|
||||
components: {
|
||||
VueMarkdown,
|
||||
SettingsModalNumericInput,
|
||||
SettingsModalShortInput,
|
||||
},
|
||||
data() {
|
||||
return {
|
||||
|
@ -599,6 +620,7 @@
|
|||
this.isGenerating = false;
|
||||
this.stopGeneration = () => {};
|
||||
this.fetchMessages();
|
||||
chatScrollToBottom();
|
||||
},
|
||||
|
||||
// message actions
|
||||
|
@ -612,7 +634,7 @@
|
|||
this.generateMessage(currConvId);
|
||||
},
|
||||
copyMsg(msg) {
|
||||
navigator.clipboard.writeText(msg.content);
|
||||
copyStr(msg.content);
|
||||
},
|
||||
editUserMsgAndRegenerate(msg) {
|
||||
if (this.isGenerating) return;
|
||||
|
|
|
@ -102,6 +102,12 @@ struct server_task_result {
|
|||
bool error;
|
||||
};
|
||||
|
||||
struct server_static_file {
|
||||
const unsigned char * data;
|
||||
unsigned int size;
|
||||
const char * mime_type;
|
||||
};
|
||||
|
||||
struct slot_params {
|
||||
bool stream = true;
|
||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
||||
|
@ -2267,6 +2273,16 @@ int main(int argc, char ** argv) {
|
|||
LOG_INF("%s\n", common_params_get_system_info(params).c_str());
|
||||
LOG_INF("\n");
|
||||
|
||||
// static files
|
||||
std::map<std::string, server_static_file> static_files = {
|
||||
{ "/", { index_html, index_html_len, "text/html; charset=utf-8" }},
|
||||
{ "/completion.js", { completion_js, completion_js_len, "text/javascript; charset=utf-8" }},
|
||||
{ "/deps_daisyui.min.css", { deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8" }},
|
||||
{ "/deps_markdown-it.js", { deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8" }},
|
||||
{ "/deps_tailwindcss.js", { deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8" }},
|
||||
{ "/deps_vue.esm-browser.js", { deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8" }},
|
||||
};
|
||||
|
||||
std::unique_ptr<httplib::Server> svr;
|
||||
#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
|
||||
if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
|
||||
|
@ -2347,7 +2363,7 @@ int main(int argc, char ** argv) {
|
|||
// Middlewares
|
||||
//
|
||||
|
||||
auto middleware_validate_api_key = [¶ms, &res_error](const httplib::Request & req, httplib::Response & res) {
|
||||
auto middleware_validate_api_key = [¶ms, &res_error, &static_files](const httplib::Request & req, httplib::Response & res) {
|
||||
static const std::unordered_set<std::string> public_endpoints = {
|
||||
"/health",
|
||||
"/models",
|
||||
|
@ -2359,8 +2375,8 @@ int main(int argc, char ** argv) {
|
|||
return true;
|
||||
}
|
||||
|
||||
// If path is public, skip validation
|
||||
if (public_endpoints.find(req.path) != public_endpoints.end()) {
|
||||
// If path is public or is static file, skip validation
|
||||
if (public_endpoints.find(req.path) != public_endpoints.end() || static_files.find(req.path) != static_files.end()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -3104,13 +3120,6 @@ int main(int argc, char ** argv) {
|
|||
res.status = 200; // HTTP OK
|
||||
};
|
||||
|
||||
auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
|
||||
return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
|
||||
res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
||||
//
|
||||
// Router
|
||||
//
|
||||
|
@ -3125,12 +3134,13 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
} else {
|
||||
// using embedded static files
|
||||
svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
|
||||
svr->Get("/completion.js", handle_static_file(completion_js, completion_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/deps_daisyui.min.css", handle_static_file(deps_daisyui_min_css, deps_daisyui_min_css_len, "text/css; charset=utf-8"));
|
||||
svr->Get("/deps_markdown-it.js", handle_static_file(deps_markdown_it_js, deps_markdown_it_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/deps_tailwindcss.js", handle_static_file(deps_tailwindcss_js, deps_tailwindcss_js_len, "text/javascript; charset=utf-8"));
|
||||
svr->Get("/deps_vue.esm-browser.js", handle_static_file(deps_vue_esm_browser_js, deps_vue_esm_browser_js_len, "text/javascript; charset=utf-8"));
|
||||
for (const auto & it : static_files) {
|
||||
const server_static_file & static_file = it.second;
|
||||
svr->Get(it.first.c_str(), [&static_file](const httplib::Request &, httplib::Response & res) {
|
||||
res.set_content(reinterpret_cast<const char*>(static_file.data), static_file.size, static_file.mime_type);
|
||||
return false;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// register API routes
|
||||
|
|
|
@ -150,6 +150,28 @@ static inline __m128i packNibbles( __m256i bytes )
|
|||
#endif
|
||||
}
|
||||
#elif defined(__AVX__)
|
||||
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
||||
{
|
||||
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
||||
const __m128i lowByte = _mm_set1_epi16( 0xFF );
|
||||
__m128i high = _mm_andnot_si128( lowByte, bytes1 );
|
||||
__m128i low = _mm_and_si128( lowByte, bytes1 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes1 = _mm_or_si128( low, high );
|
||||
high = _mm_andnot_si128( lowByte, bytes2 );
|
||||
low = _mm_and_si128( lowByte, bytes2 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes2 = _mm_or_si128( low, high );
|
||||
|
||||
return _mm_packus_epi16( bytes1, bytes2);
|
||||
}
|
||||
|
||||
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
||||
const __m128i ax = _mm_sign_epi8(x, x);
|
||||
const __m128i sy = _mm_sign_epi8(y, x);
|
||||
return _mm_maddubs_epi16(ax, sy);
|
||||
}
|
||||
|
||||
// spread 32 bits to 32 bytes { 0x00, 0xFF }
|
||||
static inline __m256i bytes_from_bits_32(const uint8_t * x) {
|
||||
uint32_t x32;
|
||||
|
@ -217,26 +239,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
|||
return sum_i16_pairs_float(doth, dotl);
|
||||
}
|
||||
|
||||
static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
||||
{
|
||||
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
||||
const __m128i lowByte = _mm_set1_epi16( 0xFF );
|
||||
__m128i high = _mm_andnot_si128( lowByte, bytes1 );
|
||||
__m128i low = _mm_and_si128( lowByte, bytes1 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes1 = _mm_or_si128( low, high );
|
||||
high = _mm_andnot_si128( lowByte, bytes2 );
|
||||
low = _mm_and_si128( lowByte, bytes2 );
|
||||
high = _mm_srli_epi16( high, 4 );
|
||||
bytes2 = _mm_or_si128( low, high );
|
||||
// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
|
||||
static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
|
||||
const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
|
||||
const __m128i mone = _mm_set1_epi16(1);
|
||||
|
||||
return _mm_packus_epi16( bytes1, bytes2);
|
||||
const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
|
||||
const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
|
||||
const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
|
||||
const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
|
||||
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
||||
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
||||
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
||||
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
||||
const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
|
||||
const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
|
||||
return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
|
||||
}
|
||||
|
||||
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
||||
const __m128i ax = _mm_sign_epi8(x, x);
|
||||
const __m128i sy = _mm_sign_epi8(y, x);
|
||||
return _mm_maddubs_epi16(ax, sy);
|
||||
// quad fp16 delta calculation
|
||||
static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
|
||||
// GGML_FP16_TO_FP32 is faster than Intel F16C
|
||||
return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
|
||||
_mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
|
||||
}
|
||||
#endif
|
||||
#elif defined(__SSSE3__)
|
||||
|
@ -2004,10 +2029,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
|
||||
sumf = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
const __m128i mone = _mm_set1_epi16(1);
|
||||
|
||||
__m256 accum1 = _mm256_setzero_ps();
|
||||
__m256 accum2 = _mm256_setzero_ps();
|
||||
__m256 accum = _mm256_setzero_ps();
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
||||
|
@ -2020,21 +2042,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
|
||||
const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
|
||||
const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
|
||||
|
||||
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
||||
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
||||
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
||||
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
||||
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
||||
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
||||
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
||||
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
||||
accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
|
||||
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
|
||||
const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
|
||||
const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
|
||||
const __m256 p = sum_i16_pairs_float(p_2, p_1);
|
||||
|
||||
const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
|
||||
accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
||||
sumf = hsum_float_8(accum);
|
||||
#elif defined(__SSSE3__)
|
||||
// set constants
|
||||
const __m128i lowMask = _mm_set1_epi8(0xF);
|
||||
|
@ -3535,7 +3556,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
}
|
||||
|
||||
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
#elif defined(__AVX2__)
|
||||
// Initialize accumulator with zeros
|
||||
__m256 acc = _mm256_setzero_ps();
|
||||
|
||||
|
@ -3549,14 +3570,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
|
|||
const __m256 q = mul_sum_i8_pairs_float(qx, qy);
|
||||
|
||||
// Multiply q with scale and accumulate
|
||||
#if defined(__AVX2__)
|
||||
acc = _mm256_fmadd_ps( d, q, acc );
|
||||
#else
|
||||
acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
|
||||
#endif
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(acc);
|
||||
#elif defined(__AVX__)
|
||||
__m256 accum = _mm256_setzero_ps();
|
||||
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
|
||||
const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
|
||||
const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
||||
const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
|
||||
const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
|
||||
const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
|
||||
const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
|
||||
const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
|
||||
|
||||
const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
|
||||
const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
|
||||
accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(accum);
|
||||
#elif defined(__riscv_v_intrinsic)
|
||||
size_t vl = __riscv_vsetvl_e8m1(qk);
|
||||
|
||||
|
@ -10322,10 +10358,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
#elif defined __AVX__
|
||||
const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
|
||||
const __m128i m4b = _mm_set1_epi8(0x0f);
|
||||
const __m128i mone = _mm_set1_epi16(1);
|
||||
|
||||
__m256 accum1 = _mm256_setzero_ps();
|
||||
__m256 accum2 = _mm256_setzero_ps();
|
||||
__m256 accum = _mm256_setzero_ps();
|
||||
for (; ib + 1 < nb; ib += 2) {
|
||||
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
|
||||
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
||||
|
@ -10338,21 +10372,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
|
|||
const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
|
||||
const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
|
||||
const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
|
||||
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
||||
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
||||
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
||||
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
||||
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
||||
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
||||
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
||||
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
||||
accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
|
||||
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
|
||||
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
|
||||
|
||||
const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
|
||||
const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
|
||||
accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
|
||||
}
|
||||
|
||||
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
||||
sumf = hsum_float_8(accum);
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
const vector signed char lowMask = vec_splats((signed char)0xF);
|
||||
|
|
|
@ -1469,8 +1469,12 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
|
|||
sumf += (ggml_float)_mm512_reduce_add_ps(c2);
|
||||
|
||||
#undef LOAD
|
||||
#elif defined(__AVX2__)
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
#if defined(__AVX2__)
|
||||
#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
|
||||
#else
|
||||
#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
|
||||
#endif
|
||||
__m256 c1 = _mm256_setzero_ps();
|
||||
__m256 c2 = _mm256_setzero_ps();
|
||||
__m256 c3 = _mm256_setzero_ps();
|
||||
|
|
|
@ -144,17 +144,17 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then
|
|||
-e 's/([[:space:]]|[ab]\/)CMakeLists.txt/\1ggml\/CMakeLists.txt/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/CMakeLists.txt/\1ggml\/src\/CMakeLists.txt/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)cmake\/FindSIMD.cmake/\1ggml\/cmake\/FindSIMD.cmake/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\1.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\1.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\1.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\1.cu/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\1.m/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.c/\1ggml\/src\/ggml\2.c/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cpp/\1ggml\/src\/ggml\2.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.h/\1ggml\/src\/ggml\2.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.cu/\1ggml\/src\/ggml\2.cu/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml(.*)\.m/\1ggml\/src\/ggml\2.m/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-amx\//\1ggml\/src\/ggml-amx\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cann\//\1ggml\/src\/ggml-cann\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/ggml-sycl\//\1ggml\/src\/ggml-sycl\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)src\/vulkan-shaders\//\1ggml\/src\/vulkan-shaders\//g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\1.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)include\/ggml(.*)\.h/\1ggml\/include\/ggml\2.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common\.h/\1examples\/common.h/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common\.cpp/\1examples\/common.cpp/g' \
|
||||
-e 's/([[:space:]]|[ab]\/)examples\/common-ggml\.h/\1examples\/common-ggml.h/g' \
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue