ONNX usage

#14
by hexgrad - opened

Quick and dirty ONNX usage:

# Assuming you are in the Kokoro-82M directory

!pip install onnxruntime

from onnxruntime import InferenceSession

# Tokens produced by phonemize() and tokenize() in kokoro.py
tokens = [50, 157, 43, 135, 16, 53, 135, 46, 16, 43, 102, 16, 56, 156, 57, 135, 6, 16, 102, 62, 61, 16, 70, 56, 16, 138, 56, 156, 72, 56, 61, 85, 123, 83, 44, 83, 54, 16, 53, 65, 156, 86, 61, 62, 131, 83, 56, 4, 16, 54, 156, 43, 102, 53, 16, 156, 72, 61, 53, 102, 112, 16, 70, 56, 16, 138, 56, 44, 156, 76, 158, 123, 56, 16, 62, 131, 156, 43, 102, 54, 46, 16, 102, 48, 16, 81, 47, 102, 54, 16, 54, 156, 51, 158, 46, 16, 70, 16, 92, 156, 135, 46, 16, 54, 156, 43, 102, 48, 4, 16, 81, 47, 102, 16, 50, 156, 72, 64, 83, 56, 62, 16, 156, 51, 158, 64, 83, 56, 16, 44, 157, 102, 56, 16, 44, 156, 76, 158, 123, 56, 4]

# Context length is 512, but leave room for the pad token 0 at the start & end
assert len(tokens) <= 510, len(tokens)

# Style vector based on len(tokens), ref_s has shape (1, 256)
ref_s = torch.load('voices/af.pt')[len(tokens)].numpy()

# Add the pad ids, and reshape tokens, should now have shape (1, <=512)
tokens = [[0, *tokens, 0]]

sess = InferenceSession('kokoro-v0_19.onnx')

audio = sess.run(None, dict(
    tokens=tokens, 
    style=ref_s,
    speed=np.ones(1, dtype=np.float32)
))[0]

from IPython.display import display, Audio
display(Audio(data=audio, rate=24000, autoplay=True))

!pip install onnxruntime # NotImplementedError: A UTF-8 locale is required. Got ANSI_X3.4-1968

# # 1️⃣ Install dependencies silently
!git clone https://huggingface.co/hexgrad/Kokoro-82M
%cd Kokoro-82M
!apt-get -qq -y install espeak-ng > /dev/null 2>&1
!pip install -q phonemizer torch transformers scipy munch

## 1️⃣ Instalar dependencias necesarias
!pip install -q pydub
# Assuming you are in the Kokoro-82M directory
from models import build_model
import torch
import numpy as np
from onnxruntime import InferenceSession

# Tokens produced by phonemize() and tokenize() in kokoro.py
tokens = [50, 157, 43, 135, 16, 53, 135, 46, 16, 43, 102, 16, 56, 156, 57, 135, 6, 16, 102, 62, 61, 16, 70, 56, 16, 138, 56, 156, 72, 56, 61, 85, 123, 83, 44, 83, 54, 16, 53, 65, 156, 86, 61, 62, 131, 83, 56, 4, 16, 54, 156, 43, 102, 53, 16, 156, 72, 61, 53, 102, 112, 16, 70, 56, 16, 138, 56, 44, 156, 76, 158, 123, 56, 16, 62, 131, 156, 43, 102, 54, 46, 16, 102, 48, 16, 81, 47, 102, 54, 16, 54, 156, 51, 158, 46, 16, 70, 16, 92, 156, 135, 46, 16, 54, 156, 43, 102, 48, 4, 16, 81, 47, 102, 16, 50, 156, 72, 64, 83, 56, 62, 16, 156, 51, 158, 64, 83, 56, 16, 44, 157, 102, 56, 16, 44, 156, 76, 158, 123, 56, 4]

# Context length is 512, but leave room for the pad token 0 at the start & end
assert len(tokens) <= 510, len(tokens)

# Style vector based on len(tokens), ref_s has shape (1, 256)
# ref_s = torch.load('voices/af.pt')[len(tokens)].numpy()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL = build_model('kokoro-v0_19.pth', device)
VOICE_NAME = [
    'af', # Default voice is a 50-50 mix of Bella & Sarah
    'af_bella', 'af_sarah', 'am_adam', 'am_michael',
    'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
    'af_nicole', 'af_sky',
][6]
VOICEPACK = torch.load(f'voices/{VOICE_NAME}.pt')[len(tokens)].numpy()
print(f'Loaded voice: {VOICE_NAME}')

# Add the pad ids, and reshape tokens, should now have shape (1, <=512)
tokens = [[0, *tokens, 0]]

sess = InferenceSession('kokoro-v0_19.onnx')

audio = sess.run(None, dict(
    tokens=tokens, 
    style=VOICEPACK,
    speed=np.ones(1, dtype=np.float32)
))[0]

from IPython.display import display, Audio
display(Audio(data=audio, rate=24000, autoplay=True))

Im getting an error with CoreML

>>> from onnxruntime import InferenceSession
>>> sess = InferenceSession('kokoro-v0_19.onnx', providers=['CoreMLExecutionProvider'])
2025-01-02 15:26:59.464091 [W:onnxruntime:, helper.cc:88 IsInputSupported] CoreML does not support shapes with dimension values of 0. Input:/Slice_1_output_0, shape: {0}
2025-01-02 15:26:59.464275 [W:onnxruntime:, helper.cc:88 IsInputSupported] CoreML does not support shapes with dimension values of 0. Input:/decoder/generator/m_source/l_sin_gen/Slice_output_0, shape: {0}
2025-01-02 15:26:59.464510 [W:onnxruntime:, helper.cc:82 IsInputSupported] CoreML does not support input dim > 16384. Input:decoder.generator.stft.stft.window_sum, shape: {5000015}
2025-01-02 15:26:59.465130 [W:onnxruntime:, coreml_execution_provider.cc:115 GetCapability] CoreMLExecutionProvider::GetCapability, number of partitions supported by CoreML: 123 number of nodes in the graph: 2361 number of nodes supported by CoreML: 949
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "venv/lib/python3.12/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py", line 465, in __init__
    self._create_inference_session(providers, provider_options, disabled_optimizers)
  File "venv/lib/python3.12/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py", line 537, in _create_inference_session
    sess.initialize_session(providers, provider_options, disabled_optimizers)
onnxruntime.capi.onnxruntime_pybind11_state.Fail: [ONNXRuntimeError] : 1 : FAIL : model_builder.cc:768 RegisterModelInputOutput Unable to get shape for output: /Squeeze_output_0

Thanks for the usage!
I published ready to use package here

https://github.com/thewh1teagle/kokoro-onnx

Quick and dirty ONNX usage:

# Assuming you are in the Kokoro-82M directory

Can you share the code used to convert it to onnx? I would like to try quantize it to fp16 / int8 for faster inference
Also, is there some Github repository related to the project? as it's easy to work with code in Github rather than HuggingFace

Sign up or log in to comment