# File size: 2,592 Bytes
# e67043b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from __future__ import annotations

from typing import TYPE_CHECKING

from gradio_client.client import Job

from gradio_tools.tools.gradio_tool import GradioTool

if TYPE_CHECKING:
    import gradio as gr

# Language name (as written in a query) -> two-letter code that is
# interpolated into the speaker label sent to the model.
SUPPORTED_LANGS = {
    "English": "en",
    "German": "de",
    "Spanish": "es",
    "French": "fr",
    "Hindi": "hi",
    "Italian": "it",
    "Japanese": "ja",
    "Korean": "ko",
    "Polish": "pl",
    "Portuguese": "pt",
    "Russian": "ru",
    "Turkish": "tr",
    "Chinese": "zh",
}

# Voice presets that are forwarded unchanged as the speaker value.
VOICES = ["Unconditional", "Announcer"]

# Every value accepted as the speaker part of a query: the presets first,
# followed by the language names in declaration order.
SUPPORTED_SPEAKERS = [*VOICES, *SUPPORTED_LANGS]

# Non-speech markers advertised to the caller in the tool description.
# The list is interpolated verbatim into that description, which is why the
# last two entries are phrased as short usage hints rather than bare tokens.
NON_SPEECH_TOKENS = [
    "[laughter]",
    "[laughs]",
    "[sighs]",
    "[music]",
    "[gasps]",
    "[clears throat]",
    "'♪' for song lyrics. Put ♪ on either side of the text",
    "'…' for hesitations",
]


class BarkTextToSpeechTool(GradioTool):
    """Tool that submits text to a Bark text-to-speech Space.

    By default the tool targets the ``suno/bark`` Space. A query is a single
    string of the form ``"<text>|<speaker>"``. The speaker part is optional;
    a value that is neither a voice preset (``VOICES``) nor a supported
    language name (``SUPPORTED_LANGS``) falls back to ``"Unconditional"``.
    """

    def __init__(
        self,
        name="BarkTextToSpeech",
        description=(
            "A tool for text-to-speech. Use this tool to convert text "
            "into sounds that sound like a human read it. Input will be two strings separated by a |: "
            "the first will be the text to read. The second will be the desired speaking language. "
            f"It MUST be one of the following choices {','.join(SUPPORTED_SPEAKERS)}. "
            f"Additionally, you can include the following non speech tokens: {NON_SPEECH_TOKENS}. "
            # NOTE(review): the output component is gr.Audio and postprocess
            # returns a str, so the result is described as an audio file path
            # rather than a transcript — confirm against the Space's API.
            "The output will be the path to the generated audio file."
        ),
        src="suno/bark",
        hf_token=None,
        duplicate=False,
    ) -> None:
        """Forward the tool configuration to ``GradioTool``.

        Args:
            name: Tool name passed to the base class.
            description: Usage instructions passed to the base class.
            src: Identifier of the Space to call.
            hf_token: Optional token passed through to the base class.
            duplicate: Flag passed through to the base class.
        """
        super().__init__(name, description, src, hf_token, duplicate)

    def create_job(self, query: str) -> Job:
        """Parse ``query`` and submit a generation job.

        The query is split on its *last* ``|`` so that the text itself may
        contain pipe characters. Without a ``|`` the whole query is the text
        and the ``"Unconditional"`` voice is used.

        Args:
            query: ``"<text>|<speaker>"`` or just ``"<text>"``.

        Returns:
            The job object returned by ``self.client.submit``.
        """
        text, separator, speaker = query.rpartition("|")
        if not separator:
            # No delimiter: rpartition puts the whole input in the last slot.
            text, speaker = query, "Unconditional"
        speaker = speaker.strip()

        if speaker not in VOICES:
            # Language names become a speaker label; anything unrecognised
            # falls back to the default voice instead of raising.
            code = SUPPORTED_LANGS.get(speaker)
            speaker = f"Speaker 0 ({code})" if code is not None else "Unconditional"

        # NOTE(review): fn_index=3 is presumably the generation endpoint of
        # the Space — verify if the Space layout changes.
        return self.client.submit(text, speaker, fn_index=3)

    def postprocess(self, output: str) -> str:
        """Return the job output unchanged."""
        return output

    def _block_input(self, gr) -> "gr.components.Component":
        """Return a ``gr.Textbox`` as the input component."""
        return gr.Textbox()

    def _block_output(self, gr) -> "gr.components.Component":
        """Return a ``gr.Audio`` as the output component."""
        return gr.Audio()