Siddhant committed on
Commit
a7c2f52
·
verified ·
1 Parent(s): 3d220e6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -0
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import soundfile
3
+ import time
4
+ import torch
5
+ import scipy.io.wavfile
6
+ from espnet2.bin.tts_inference import Text2Speech
7
+ from espnet2.utils.types import str_or_none
8
+ from espnet2.bin.asr_inference import Speech2Text
9
+ from subprocess import call
10
+ import os
11
+ from espnet_model_zoo.downloader import ModelDownloader
12
+ # print(a1)
13
+ # exit()
14
+ # exit()
15
+ # tagen = 'kan-bayashi/ljspeech_vits'
16
+ # vocoder_tagen = "none"
17
+
18
# All three task models are built from the same training config and the same
# checkpoint file; the only argument that differs is the prompt-token string.
_UNIVERSLU_EXP_DIR = "UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual"
_UNIVERSLU_PROMPT_TOKEN_FILE = "UniverSLU-17-Task-Specifier/add_tokens-Copy1.txt"


def _load_speech2text(lang_prompt_token):
    """Build a ``Speech2Text`` decoder for one prompt-token string.

    Args:
        lang_prompt_token: Space-separated prompt tokens forwarded unchanged
            as the ``lang_prompt_token`` argument, e.g.
            ``"<|en|> <|ic|> <|fsc|>"``.

    Returns:
        The object returned by ``Speech2Text.from_pretrained``.
    """
    return Speech2Text.from_pretrained(
        asr_train_config=_UNIVERSLU_EXP_DIR + "/config.yaml",
        asr_model_file=_UNIVERSLU_EXP_DIR + "/valid.acc.ave_10best.pth",
        # Decoding parameters are not included in the model file
        lang_prompt_token=lang_prompt_token,
        prompt_token_file=_UNIVERSLU_PROMPT_TOKEN_FILE,
        nbest=1,
    )


speech2text_slurp = _load_speech2text("<|en|> <|ner|> <|SLURP|>")
speech2text_fsc = _load_speech2text("<|en|> <|ic|> <|fsc|>")
speech2text_grabo = _load_speech2text("<|nl|> <|scr|> <|grabo_scr|>")
44
+
45
def inference(wav, data):
    """Run the model selected by ``data`` on an audio file and return its text.

    Args:
        wav: Object whose ``name`` attribute is the path of the audio file to
            read (it is passed to ``soundfile.read``).
        data: Task name. One of ``"english_slurp"``, ``"english_fsc"``,
            ``"dutch"`` or ``"dutch_scd"``; the last two select the same model.

    Returns:
        The first element of the top hypothesis returned by the selected
        model.

    Raises:
        ValueError: If ``data`` is not a supported task name.
    """
    # "dutch_scd" is the label offered by the UI radio button while the
    # bundled example row uses "dutch"; accept both so neither path ends in an
    # unbound ``text``.
    models = {
        "english_slurp": speech2text_slurp,
        "english_fsc": speech2text_fsc,
        "dutch": speech2text_grabo,
        "dutch_scd": speech2text_grabo,
    }
    try:
        speech2text = models[data]
    except KeyError:
        raise ValueError(
            f"Unsupported task {data!r}; expected one of {sorted(models)}"
        ) from None

    speech, _rate = soundfile.read(wav.name)
    # A 2-D array means multi-channel audio: keep only the first channel.
    # Previously this was done for the FSC task only.
    if speech.ndim == 2:
        speech = speech[:, 0]

    with torch.no_grad():
        nbests = speech2text(speech)
    text, *_ = nbests[0]
    return text
93
+
94
# Static text shown on the demo page.
title = "UniverSLU"
description = "Gradio demo for UniverSLU: Universal Spoken Language Understanding for Diverse Tasks with Natural Language Instructions. To use it, simply record your audio or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"

# Each example row is [audio file, task name]; the task name is the second
# argument passed to `inference`.
examples = [
    ["audio_slurp.flac", "english_slurp"],
    ["audio_fsc.wav", "english_fsc"],
    ["audio_grabo.wav", "dutch"],
]

# Build the UI and start serving. This runs at import time, as before.
gr.Interface(
    inference,
    [
        gr.inputs.Audio(label="input audio", source="microphone", type="file"),
        gr.inputs.Radio(
            # The Dutch choice is spelled "dutch" (it used to be "dutch_scd")
            # so that it matches the example row above and a task name that
            # `inference` recognises.
            choices=["english_slurp", "english_fsc", "dutch"],
            type="value",
            default="english_fsc",
            label="Task",
        ),
    ],
    gr.outputs.Textbox(type="str", label="Output"),
    title=title,
    description=description,
    article=article,
    enable_queue=True,
    examples=examples,
).launch(debug=True)