examples : add embd_to_audio to tts-outetts.py [no ci] (#11235)

This commit contains a suggestion for adding the missing embd_to_audio function from tts.cpp to tts-outetts.py. This introduces a depencency numpy which I was not sure if that is acceptable or not (only PyTorch was mentioned in referened PR). Also the README has been updated with instructions to run the example with llama-server and the python script. Refs: https://github.com/ggerganov/llama.cpp/pull/10784#issuecomment-2548377734
2025-01-15 05:44:38 +01:00 · 2025-01-15 05:44:38 +01:00 · 0ccd7f3eb2
commit 0ccd7f3eb2
parent f446c2cf6a
2 changed files with 163 additions and 2 deletions
--- a/examples/tts/README.md
+++ b/examples/tts/README.md
@ -78,3 +78,40 @@ play the audio:
 $ aplay output.wav
 ```
 ### Running the example with llama-server
 Running this example with `llama-server` is also possible and requires two
 server instances to be started. One will serve the LLM model and the other
 will serve the voice decoder model.
 The LLM model server can be started with the following command:
 ```console
 $ ./build/bin/llama-server -m ./models/outetts-0.2-0.5B-q8_0.gguf --port 8020
 ```
 And the voice decoder model server can be started using:
 ```console
 ./build/bin/llama-server -m ./models/wavtokenizer-large-75-f16.gguf --port 8021 --embeddings --pooling none
 ```
 Then we can run [tts-outetts.py](tts-outetts.py) to generate the audio.
 First create a virtual environment for python and install the required
 dependencies (this in only required to be done once):
 ```console
 $ python3 -m venv venv
 $ source venv/bin/activate
 (venv) pip install requests numpy
 ```
 And then run the python script using:
 ```conole
 (venv) python ./examples/tts/tts-outetts.py http://localhost:8020 http://localhost:8021 "Hello world"
 spectrogram generated: n_codes: 90, n_embd: 1282
 converting to audio ...
 audio generated: 28800 samples
 audio written to file "output.wav"
 ```
 And to play the audio we can again use aplay or any other media player:
 ```console
 $ aplay output.wav
 ```
--- a/examples/tts/tts-outetts.py
+++ b/examples/tts/tts-outetts.py
@ -3,6 +3,121 @@ import sys
 #import struct
 import requests
 import re
 import struct
 import numpy as np
 from concurrent.futures import ThreadPoolExecutor
 def fill_hann_window(size, periodic=True):
    if periodic:
        return np.hanning(size + 1)[:-1]
    return np.hanning(size)
 def irfft(n_fft, complex_input):
    return np.fft.irfft(complex_input, n=n_fft)
 def fold(buffer, n_out, n_win, n_hop, n_pad):
    result = np.zeros(n_out)
    n_frames = len(buffer) // n_win
    for i in range(n_frames):
        start = i * n_hop
        end = start + n_win
        result[start:end] += buffer[i * n_win:(i + 1) * n_win]
    return result[n_pad:-n_pad] if n_pad > 0 else result
 def process_frame(args):
    l, n_fft, ST, hann = args
    frame = irfft(n_fft, ST[l])
    frame = frame * hann
    hann2 = hann * hann
    return frame, hann2
 def embd_to_audio(embd, n_codes, n_embd, n_thread=4):
    embd = np.asarray(embd, dtype=np.float32).reshape(n_codes, n_embd)
    n_fft = 1280
    n_hop = 320
    n_win = 1280
    n_pad = (n_win - n_hop) // 2
    n_out = (n_codes - 1) * n_hop + n_win
    hann = fill_hann_window(n_fft, True)
    E = np.zeros((n_embd, n_codes), dtype=np.float32)
    for l in range(n_codes):
        for k in range(n_embd):
            E[k, l] = embd[l, k]
    half_embd = n_embd // 2
    S = np.zeros((n_codes, half_embd + 1), dtype=np.complex64)
    for k in range(half_embd):
        for l in range(n_codes):
            mag = E[k, l]
            phi = E[k + half_embd, l]
            mag = np.clip(np.exp(mag), 0, 1e2)
            S[l, k] = mag * np.exp(1j * phi)
    res = np.zeros(n_codes * n_fft)
    hann2_buffer = np.zeros(n_codes * n_fft)
    with ThreadPoolExecutor(max_workers=n_thread) as executor:
        args = [(l, n_fft, S, hann) for l in range(n_codes)]
        results = list(executor.map(process_frame, args))
        for l, (frame, hann2) in enumerate(results):
            res[l*n_fft:(l+1)*n_fft] = frame
            hann2_buffer[l*n_fft:(l+1)*n_fft] = hann2
    audio = fold(res, n_out, n_win, n_hop, n_pad)
    env = fold(hann2_buffer, n_out, n_win, n_hop, n_pad)
    mask = env > 1e-10
    audio[mask] /= env[mask]
    return audio
 def save_wav(filename, audio_data, sample_rate):
    num_channels = 1
    bits_per_sample = 16
    bytes_per_sample = bits_per_sample // 8
    data_size = len(audio_data) * bytes_per_sample
    byte_rate = sample_rate * num_channels * bytes_per_sample
    block_align = num_channels * bytes_per_sample
    chunk_size = 36 + data_size  # 36 = size of header minus first 8 bytes
    header = struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF',
        chunk_size,
        b'WAVE',
        b'fmt ',
        16,                # fmt chunk size
        1,                 # audio format (PCM)
        num_channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
        b'data',
        data_size
    )
    audio_data = np.clip(audio_data * 32767, -32768, 32767)
    pcm_data = audio_data.astype(np.int16)
    with open(filename, 'wb') as f:
        f.write(header)
        f.write(pcm_data.tobytes())
 def process_text(text: str):
    text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed
@ -170,6 +285,15 @@ n_embd = len(embd[0])
 print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd))
 # post-process the spectrogram to convert to audio
 # TODO: see the tts.cpp:embd_to_audio() and implement it in Python
 print('converting to audio ...')
-print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python')
+audio = embd_to_audio(embd, n_codes, n_embd)
 print('audio generated: %d samples' % len(audio))
 filename = "output.wav"
 sample_rate = 24000 # sampling rate
 # zero out first 0.25 seconds
 audio[:24000 // 4] = 0.0
 save_wav(filename, audio, sample_rate)
 print('audio written to file "%s"' % filename)