ok
This commit is contained in:
parent
3744bc4bc5
commit
a8153cc681
5 changed files with 107 additions and 48 deletions
|
@ -2215,6 +2215,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
|
|||
params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
|
||||
params.vocoder.hf_repo = "ggml-org/WavTokenizer";
|
||||
params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf";
|
||||
params.ctx_shift = false; // for better results
|
||||
}
|
||||
).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER}));
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<title>llama.cpp TTS</title>
|
||||
<style>
|
||||
body {
|
||||
font-family: 'Courier New', Courier, monospace;
|
||||
font-family: monospace;
|
||||
margin: 2em;
|
||||
}
|
||||
</style>
|
||||
|
@ -16,10 +16,15 @@
|
|||
|
||||
Input text:<br/>
|
||||
<textarea id="input" rows="4" cols="50">Hello world</textarea><br/>
|
||||
<button id="btn_speak" onclick="speak()">Speak</button>
|
||||
<button id="btn_speak" onclick="speak()">Speak</button><br/>
|
||||
<br/>
|
||||
<p id="status">Status: ready</p><br/>
|
||||
<p id="output"></p>
|
||||
|
||||
<script>
|
||||
const input_el = document.getElementById('input');
|
||||
const output_el = document.getElementById('output');
|
||||
const status_el = document.getElementById('status');
|
||||
const btn_speak_el = document.getElementById('btn_speak');
|
||||
|
||||
let working = false;
|
||||
|
@ -32,6 +37,9 @@
|
|||
working = true;
|
||||
input_el.disabled = true;
|
||||
btn_speak_el.disabled = true;
|
||||
status_el.textContent = 'Status: generating...';
|
||||
|
||||
const input = input_el.value.trim();
|
||||
|
||||
try {
|
||||
const res = await fetch('/v1/audio/speech', {
|
||||
|
@ -40,7 +48,7 @@
|
|||
'Content-Type': 'application/json'
|
||||
},
|
||||
body: JSON.stringify({
|
||||
input: input_el.value.trim(),
|
||||
input,
|
||||
response_format: 'wav',
|
||||
}),
|
||||
});
|
||||
|
@ -50,6 +58,11 @@
|
|||
const url = URL.createObjectURL(blob);
|
||||
const audio = new Audio(url);
|
||||
audio.play();
|
||||
status_el.textContent = 'Status: playing...';
|
||||
audio.addEventListener('ended', () => {
|
||||
status_el.textContent = 'Status: ready';
|
||||
});
|
||||
echoTimings(res.headers, input);
|
||||
} else {
|
||||
const text = await res.text();
|
||||
throw new Error(`Failed to generate speech: ${text}`);
|
||||
|
@ -57,12 +70,43 @@
|
|||
} catch (e) {
|
||||
console.error(e);
|
||||
alert(e.message);
|
||||
status_el.textContent = 'Status: ready';
|
||||
}
|
||||
|
||||
working = false;
|
||||
input_el.disabled = false;
|
||||
btn_speak_el.disabled = false;
|
||||
}
|
||||
|
||||
function echoTimings(headers, input_txt) {
|
||||
try {
|
||||
const timingsTTC = JSON.parse(headers.get('X-timings-ttc'));
|
||||
const timingsVoc = JSON.parse(headers.get('X-timings-voc'));
|
||||
const timingsSpec = JSON.parse(headers.get('X-timings-spec'));
|
||||
output_el.innerHTML = `
|
||||
<b>Input text:</b> ${escapeHtml(input_txt)}<br/>
|
||||
<b>Timings:</b><br/>
|
||||
<b>TTC:</b>
|
||||
<ul>
|
||||
${Object.entries(timingsTTC).map(([k, v]) => `<li>${k}: ${v.toFixed(2)} ms</li>`).join('')}
|
||||
</ul>
|
||||
<b>Voc:</b> ${timingsVoc.t_voc_ms.toFixed(2)} ms<br/>
|
||||
<b>Spec:</b> ${timingsSpec.t_spec_ms.toFixed(2)} ms
|
||||
`;
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
output_el.innerHTML = 'No timings data is available.';
|
||||
}
|
||||
}
|
||||
|
||||
function escapeHtml(unsafe) {
|
||||
return unsafe
|
||||
.replace(/&/g, "&")
|
||||
.replace(/</g, "<")
|
||||
.replace(/>/g, ">")
|
||||
.replace(/"/g, """)
|
||||
.replace(/'/g, "'");
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -942,6 +942,7 @@ struct server_task_result_embd : server_task_result {
|
|||
struct server_task_result_tts_embd : server_task_result {
|
||||
int index = 0;
|
||||
std::vector<float> embd;
|
||||
double t_ms = 0.0;
|
||||
|
||||
virtual int get_index() override {
|
||||
return index; // unused
|
||||
|
@ -1749,11 +1750,15 @@ struct server_context {
|
|||
|
||||
if (!params.vocoder.model.empty()) {
|
||||
common_params v_params = params_base;
|
||||
v_params.model = params.vocoder.model;
|
||||
v_params.model_url = params.vocoder.model_url;
|
||||
v_params.hf_repo = params.vocoder.hf_repo;
|
||||
v_params.hf_file = params.vocoder.hf_file;
|
||||
v_params.embedding = true;
|
||||
v_params.model = params.vocoder.model;
|
||||
v_params.model_url = params.vocoder.model_url;
|
||||
v_params.hf_repo = params.vocoder.hf_repo;
|
||||
v_params.hf_file = params.vocoder.hf_file;
|
||||
v_params.embedding = true;
|
||||
v_params.pooling_type = LLAMA_POOLING_TYPE_NONE;
|
||||
// make sure the vocoder has the sufficient batch size
|
||||
v_params.n_batch = v_params.n_ctx;
|
||||
v_params.n_ubatch = v_params.n_ctx;
|
||||
llama_init_vocoder = common_init_from_params(v_params);
|
||||
}
|
||||
|
||||
|
@ -2606,9 +2611,18 @@ struct server_context {
|
|||
} break;
|
||||
case SERVER_TASK_TYPE_TTS_EMBD:
|
||||
{
|
||||
const auto ctx_cts = llama_init_vocoder.context.get();
|
||||
const int n_ubatch = llama_n_ubatch(ctx_cts);
|
||||
const int n_codes = (int) task.prompt_tokens.size();
|
||||
if (n_codes > n_ubatch) {
|
||||
send_error(task, string_format("Number of codes (%d) exceeds the maximum ubatch of vocoder model (%d)", n_codes, n_ubatch), ERROR_TYPE_INVALID_REQUEST);
|
||||
break;
|
||||
}
|
||||
|
||||
std::vector<float> embd;
|
||||
SRV_DBG("tts_get_embd with %d tokens", (int) task.prompt_tokens.size());
|
||||
int status = tts_get_embd(llama_init_vocoder.context.get(), task.prompt_tokens, embd);
|
||||
uint64_t t_start = ggml_time_us();
|
||||
SRV_DBG("tts_get_embd with %d codes", n_codes);
|
||||
int status = tts_get_embd(ctx_cts, task.prompt_tokens, embd);
|
||||
if (status != 0) {
|
||||
send_error(task, string_format("Failed to get TTS embedding, status code = %d", status), ERROR_TYPE_SERVER);
|
||||
break;
|
||||
|
@ -2620,6 +2634,7 @@ struct server_context {
|
|||
auto res = std::make_unique<server_task_result_tts_embd>();
|
||||
res->id = task.id;
|
||||
res->embd = std::move(embd);
|
||||
res->t_ms = (ggml_time_us() - t_start) / 1e3;
|
||||
queue_results.send(std::move(res));
|
||||
} break;
|
||||
}
|
||||
|
@ -4149,8 +4164,9 @@ int main(int argc, char ** argv) {
|
|||
return;
|
||||
}
|
||||
|
||||
llama_tokens audio_tokens;
|
||||
// convert text to audio token
|
||||
llama_tokens codes;
|
||||
result_timings ttc_timings;
|
||||
// convert text to codes
|
||||
{
|
||||
server_task task = server_task(SERVER_TASK_TYPE_COMPLETION);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
|
@ -4174,32 +4190,35 @@ int main(int argc, char ** argv) {
|
|||
const server_task_result_cmpl_final * result = dynamic_cast<server_task_result_cmpl_final*>(raw_result.get());
|
||||
GGML_ASSERT(result != nullptr);
|
||||
GGML_ASSERT(!result->tokens.empty());
|
||||
audio_tokens = std::move(result->tokens);
|
||||
codes = std::move(result->tokens);
|
||||
|
||||
// debug
|
||||
SRV_DBG("codes str (before filter) = %s\n", common_detokenize(ctx_server.ctx, audio_tokens, true).c_str());
|
||||
// SRV_DBG("codes str (before filter) = %s\n", common_detokenize(ctx_server.ctx, codes, true).c_str());
|
||||
|
||||
// post-process audio tokens
|
||||
// post-process codes
|
||||
// remove all non-audio tokens (i.e. < 151672 || > 155772)
|
||||
audio_tokens.erase(std::remove_if(
|
||||
audio_tokens.begin(),
|
||||
audio_tokens.end(),
|
||||
codes.erase(std::remove_if(
|
||||
codes.begin(),
|
||||
codes.end(),
|
||||
[](llama_token t) { return t < 151672 || t > 155772; }),
|
||||
audio_tokens.end());
|
||||
SRV_DBG("codes size = %d\n", (int) audio_tokens.size());
|
||||
codes.end());
|
||||
SRV_DBG("codes size = %d\n", (int) codes.size());
|
||||
|
||||
ttc_timings = std::move(result->timings);
|
||||
}
|
||||
|
||||
// debug
|
||||
SRV_DBG("codes str = %s\n", common_detokenize(ctx_server.ctx, audio_tokens, true).c_str());
|
||||
// SRV_DBG("codes str = %s\n", common_detokenize(ctx_server.ctx, codes, true).c_str());
|
||||
|
||||
// convert audio token to embeddings
|
||||
int n_embd = llama_n_embd(ctx_server.model);
|
||||
// convert codes to embeddings
|
||||
int n_embd = llama_n_embd(ctx_server.llama_init_vocoder.model.get());
|
||||
int n_codes = -1;
|
||||
double t_voc_ms = 0.0;
|
||||
std::vector<float> embd;
|
||||
{
|
||||
server_task task = server_task(SERVER_TASK_TYPE_TTS_EMBD);
|
||||
task.id = ctx_server.queue_tasks.get_new_id();
|
||||
task.prompt_tokens = std::move(audio_tokens);
|
||||
task.prompt_tokens = std::move(codes);
|
||||
|
||||
ctx_server.queue_results.add_waiting_tasks({task});
|
||||
ctx_server.queue_tasks.post(task);
|
||||
|
@ -4215,16 +4234,25 @@ int main(int argc, char ** argv) {
|
|||
GGML_ASSERT(!result->embd.empty());
|
||||
|
||||
// flatten the array
|
||||
n_codes = result->embd.size() / n_embd;
|
||||
embd = std::move(result->embd);
|
||||
SRV_DBG("tts embd n_code = %d\n", n_codes);
|
||||
SRV_DBG("tts embd size = %zu\n", embd.size());
|
||||
n_codes = result->embd.size() / n_embd;
|
||||
embd = std::move(result->embd);
|
||||
t_voc_ms = result->t_ms;
|
||||
SRV_DBG("tts embd n_code = %d\n", n_codes);
|
||||
SRV_DBG("tts embd size = %zu\n", embd.size());
|
||||
SRV_DBG("tts embd t_voc_ms = %lf\n", t_voc_ms);
|
||||
GGML_ASSERT(n_codes > 0);
|
||||
}
|
||||
|
||||
// convert embeddings to wav
|
||||
// will be freed by chunked_content_provider
|
||||
const auto t_spec_start = ggml_time_us();
|
||||
std::vector<float> audio = tts_embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads);
|
||||
double t_spec_ms = (ggml_time_us() - t_spec_start) / 1e3;
|
||||
|
||||
// for now, we can only leave timings in response headers, mostly for debugging
|
||||
res.set_header("X-timings-ttc", ttc_timings.to_json().dump());
|
||||
res.set_header("X-timings-voc", (json{{ "t_voc_ms", t_voc_ms }}).dump());
|
||||
res.set_header("X-timings-spec", (json{{ "t_spec_ms", t_spec_ms }}).dump());
|
||||
|
||||
const auto chunked_content_provider = [audio = std::move(audio)](size_t, httplib::DataSink & sink) mutable {
|
||||
// TODO: some how reuse save_wav16 instead of duplicating the code here
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
//
|
||||
// Terminal utils
|
||||
|
|
|
@ -346,28 +346,14 @@ int main(int argc, char ** argv) {
|
|||
LOG_INF("%s: codes audio size: %d\n", __func__, (int) codes.size());
|
||||
}
|
||||
|
||||
for (auto & token : codes) {
|
||||
token -= 151672;
|
||||
}
|
||||
|
||||
const auto t_voc_start = ggml_time_us();
|
||||
|
||||
const int n_codes = codes.size();
|
||||
|
||||
llama_batch batch = llama_batch_init(n_codes, 0, 1);
|
||||
|
||||
for (size_t i = 0; i < codes.size(); ++i) {
|
||||
common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits?
|
||||
}
|
||||
GGML_ASSERT(batch.n_tokens == n_codes);
|
||||
|
||||
if (llama_decode(ctx_cts, batch) != 0) {
|
||||
LOG_ERR("%s: llama_decode() failed\n", __func__);
|
||||
std::vector<float> embd;
|
||||
if (tts_get_embd(ctx_cts, codes, embd) != 0) {
|
||||
LOG_ERR("%s: tts_get_embd() failed\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_synchronize(ctx_cts);
|
||||
|
||||
LOG_INF("%s: time for vocoder: %.3f ms\n", __func__, (ggml_time_us() - t_voc_start) / 1000.0f);
|
||||
|
||||
const auto t_spec_start = ggml_time_us();
|
||||
|
@ -375,10 +361,9 @@ int main(int argc, char ** argv) {
|
|||
#if 1
|
||||
// spectral operations
|
||||
const int n_embd = llama_n_embd(model_cts);
|
||||
const float * embd = llama_get_embeddings(ctx_cts);
|
||||
|
||||
auto audio = tts_embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads);
|
||||
const int n_codes = codes.size();
|
||||
|
||||
auto audio = tts_embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads);
|
||||
#else
|
||||
// read the spectrogram from a file for debugging purposes
|
||||
std::vector<float> audio;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue