Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,110 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import soundfile
|
3 |
+
import time
|
4 |
+
import torch
|
5 |
+
import scipy.io.wavfile
|
6 |
+
from espnet2.bin.tts_inference import Text2Speech
|
7 |
+
from espnet2.utils.types import str_or_none
|
8 |
+
from espnet2.bin.asr_inference import Speech2Text
|
9 |
+
from subprocess import call
|
10 |
+
import os
|
11 |
+
from espnet_model_zoo.downloader import ModelDownloader
|
12 |
+
# print(a1)
|
13 |
+
# exit()
|
14 |
+
# exit()
|
15 |
+
# tagen = 'kan-bayashi/ljspeech_vits'
|
16 |
+
# vocoder_tagen = "none"
|
17 |
+
|
18 |
+
# The three task-specific decoders below are built from the same training
# config and checkpoint; only the prompt tokens passed at load time differ.
_ASR_TRAIN_CONFIG = "UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml"
_ASR_MODEL_FILE = "UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth"
_PROMPT_TOKEN_FILE = "UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt"


def _load_speech2text(lang_prompt_token):
    """Build a ``Speech2Text`` decoder for the given task prompt tokens."""
    return Speech2Text.from_pretrained(
        asr_train_config=_ASR_TRAIN_CONFIG,
        asr_model_file=_ASR_MODEL_FILE,
        # Decoding parameters are not included in the model file
        lang_prompt_token=lang_prompt_token,
        prompt_token_file=_PROMPT_TOKEN_FILE,
        nbest=1,
    )


speech2text_slurp = _load_speech2text("<|en|> <|ner|> <|SLURP|>")

speech2text_fsc = _load_speech2text("<|en|> <|ic|> <|fsc|>")

speech2text_grabo = _load_speech2text("<|nl|> <|scr|> <|grabo_scr|>")
|
44 |
+
|
45 |
+
def inference(wav, data):
    """Run the selected spoken-language-understanding task on an audio file.

    Args:
        wav: Object exposing a ``name`` attribute holding the path of the
            audio file to decode (the file handle Gradio passes in).
        data: Task identifier. ``"english_slurp"``, ``"english_fsc"`` and
            ``"dutch"`` are supported; ``"dutch_scd"`` is accepted as an
            alias for ``"dutch"`` because the UI offers it under that name.

    Returns:
        The first element of the top n-best hypothesis from the decoder.

    Raises:
        ValueError: If ``data`` is not a supported task identifier.
    """
    # Resolve the decoder first so an unsupported task fails with a clear
    # message instead of an UnboundLocalError at the final return.
    if data == "english_slurp":
        speech2text = speech2text_slurp
    elif data == "english_fsc":
        speech2text = speech2text_fsc
    elif data in ("dutch", "dutch_scd"):
        speech2text = speech2text_grabo
    else:
        raise ValueError(f"Unsupported task: {data!r}")

    # The sample rate is read but not used.
    # NOTE(review): no resampling is done here - confirm the decoder's
    # expected sample rate matches what the UI records.
    speech, _rate = soundfile.read(wav.name)

    # Multi-channel audio comes back as a 2-D array; keep the first channel.
    # The original did this only for the FSC task - presumably every decoder
    # expects 1-D input, so it is now applied to all tasks.
    if speech.ndim == 2:
        speech = speech[:, 0]

    with torch.no_grad():
        nbests = speech2text(speech)

    text, *_ = nbests[0]
    return text
|
93 |
+
|
94 |
+
# Static text shown on the demo page.
title = "UniverSLU"
description = "Gradio demo for UniverSLU: Universal Spoken Language Understanding for Diverse Tasks with Natural Language Instructions. To use it, simply record your audio or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

# Each example row is [audio file, task identifier]. The task identifiers
# must be values offered by the "Task" radio below and understood by
# inference().
examples = [
    ["audio_slurp.flac", "english_slurp"],
    ["audio_fsc.wav", "english_fsc"],
    ["audio_grabo.wav", "dutch"],
]

# Build the demo and start serving; runs at import time, as in the original.
gr.Interface(
    inference,
    [
        gr.inputs.Audio(label="input audio", source="microphone", type="file"),
        gr.inputs.Radio(
            # "dutch" (previously "dutch_scd") matches both the example row
            # above and the task string inference() checks for; the old
            # value selected a task that inference() did not handle.
            choices=["english_slurp", "english_fsc", "dutch"],
            type="value",
            default="english_fsc",
            label="Task",
        ),
    ],
    gr.outputs.Textbox(type="str", label="Output"),
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)
|