ssiidd committed on
Commit
af76ccc
·
1 Parent(s): 8f6c44e

Better task descriptions

Browse files
Files changed (1) hide show
  1. app.py +19 -19
app.py CHANGED
@@ -26,7 +26,7 @@ def inference(wav,data):
26
  speech, rate = soundfile.read(wav)
27
  if len(speech.shape)==2:
28
  speech=speech[:,0]
29
- if data == "english_slurp":
30
  speech2text = Speech2Text.from_pretrained(
31
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
32
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -52,7 +52,7 @@ def inference(wav,data):
52
  slot_val=k.split(" FILL ")[1]
53
  text=text+" "+slot_name+" : "+slot_val+","
54
  text=text+"}"
55
- elif data == "english_fsc":
56
  speech2text = Speech2Text.from_pretrained(
57
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
58
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -71,7 +71,7 @@ def inference(wav,data):
71
  objects=intent.split("_")[1]
72
  location=intent.split("_")[2]
73
  text="INTENT: {action: "+action+", object: "+objects+", location: "+location+"}"
74
- elif data == "english_snips":
75
  speech2text = Speech2Text.from_pretrained(
76
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
77
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -87,7 +87,7 @@ def inference(wav,data):
87
  text=text.split("|>")[-1]
88
  intent=text.split(" ")[0].replace("in:","")
89
  text="INTENT: "+intent
90
- elif data == "dutch_scr":
91
  speech2text = Speech2Text.from_pretrained(
92
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
93
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -103,7 +103,7 @@ def inference(wav,data):
103
  text=text.split("|>")[-1]
104
  intent=text.split(" ")[0]
105
  text="SPEECH COMMAND: "+intent
106
- elif data == "english_scr":
107
  speech2text = Speech2Text.from_pretrained(
108
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
109
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -119,7 +119,7 @@ def inference(wav,data):
119
  text=text.split("|>")[-1]
120
  intent=text.split(" ")[0].replace("command:","")
121
  text="SPEECH COMMAND: "+intent
122
- elif data == "lithuanian_scr":
123
  speech2text = Speech2Text.from_pretrained(
124
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
125
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -135,7 +135,7 @@ def inference(wav,data):
135
  text=text.split("|>")[-1]
136
  intent=text
137
  text="SPEECH COMMAND: "+intent
138
- elif data == "arabic_scr":
139
  speech2text = Speech2Text.from_pretrained(
140
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
141
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -151,7 +151,7 @@ def inference(wav,data):
151
  text=text.split("|>")[-1]
152
  intent=text.split(" ")[0].replace("command:","")
153
  text="SPEECH COMMAND: "+intent
154
- elif data == "lid_voxforge":
155
  speech2text = Speech2Text.from_pretrained(
156
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
157
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -166,7 +166,7 @@ def inference(wav,data):
166
  # import pdb;pdb.set_trace()
167
  lang=speech2text.converter.tokenizer.tokenizer.convert_ids_to_tokens(nbests[0][2][0]).replace("|>","").replace("<|","")
168
  text="LANG: "+lang
169
- elif data == "fake_speech_detection_asvspoof":
170
  speech2text = Speech2Text.from_pretrained(
171
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
172
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -182,7 +182,7 @@ def inference(wav,data):
182
  text=text.split("|>")[-1]
183
  intent=text.split(" ")[0].replace("class:","")
184
  text="SPEECH CLASS: "+intent
185
- elif data == "emotion_rec_iemocap":
186
  replace_dict={}
187
  replace_dict["em:neu"]="Neutral"
188
  replace_dict["em:ang"]="Angry"
@@ -203,7 +203,7 @@ def inference(wav,data):
203
  text=text.split("|>")[-1]
204
  intent=replace_dict[text.split(" ")[0]]
205
  text="EMOTION: "+intent
206
- elif data == "accent_classify_accentdb":
207
  speech2text = Speech2Text.from_pretrained(
208
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
209
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -219,7 +219,7 @@ def inference(wav,data):
219
  text=text.split("|>")[-1]
220
  intent=text.split(" ")[0].replace("accent:","")
221
  text="ACCENT: "+intent
222
- elif data == "sarcasm_mustard":
223
  speech2text = Speech2Text.from_pretrained(
224
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
225
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -235,7 +235,7 @@ def inference(wav,data):
235
  text=text.split("|>")[-1]
236
  intent=text.split(" ")[0].replace("class:","")
237
  text="SARCASM CLASS: "+intent
238
- elif data == "sarcasm_mustard_plus":
239
  speech2text = Speech2Text.from_pretrained(
240
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
241
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -251,7 +251,7 @@ def inference(wav,data):
251
  text=text.split("|>")[-1]
252
  intent=text.split(" ")[0].replace("class:","")
253
  text="SARCASM CLASS: "+intent
254
- elif data == "gender_voxceleb1":
255
  speech2text = Speech2Text.from_pretrained(
256
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
257
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -267,7 +267,7 @@ def inference(wav,data):
267
  text=text.split("|>")[-1]
268
  intent=text.split(" ")[0].replace("gender:f","female").replace("gender:m","male")
269
  text="GENDER: "+intent
270
- elif data == "audio_classification_esc50":
271
  speech2text = Speech2Text.from_pretrained(
272
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
273
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -283,7 +283,7 @@ def inference(wav,data):
283
  text=text.split("|>")[-1]
284
  intent=text.split(" ")[0].replace("audio_class:","")
285
  text="AUDIO EVENT CLASS: "+audio_class_arr[int(intent)]
286
- elif data == "semantic_parsing_stop":
287
  speech2text = Speech2Text.from_pretrained(
288
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
289
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -299,7 +299,7 @@ def inference(wav,data):
299
  text, *_ = nbests[0]
300
  text=text.split("|>")[-1].replace("_STOP","")
301
  text="SEMANTIC PARSE SEQUENCE: "+text
302
- elif data == "vad_freesound":
303
  speech2text = Speech2Text.from_pretrained(
304
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
305
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
@@ -328,12 +328,12 @@ title = "UniverSLU"
328
  description = "Gradio demo for UniverSLU Task Specifier (https://huggingface.co/espnet/UniverSLU-17-Task-Specifier). UniverSLU-17 Task Specifier is a Multi-task Spoken Language Understanding model from CMU WAVLab. It adapts Whisper to additional tasks using single-token task specifiers. To use it, simply record your audio or click one of the examples to load them. More details about the SLU tasks that the model is trained on and it's performance on these tasks can be found in our paper: https://aclanthology.org/2024.naacl-long.151/"
329
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
330
 
331
- examples=[['audio_slurp_ner.flac',"english_slurp"],['audio_fsc.wav',"english_fsc"],['audio_grabo.wav',"dutch_scr"],['audio_english_scr.wav',"english_scr"],['audio_lt_scr.wav',"lithuanian_scr"],['audio_ar_scr.wav',"arabic_scr"],['audio_snips.wav',"english_snips"],['audio_lid.wav',"lid_voxforge"],['audio_fsd.wav',"fake_speech_detection_asvspoof"],['audio_er.wav',"emotion_rec_iemocap"],['audio_acc.wav',"accent_classify_accentdb"],['audio_mustard.wav',"sarcasm_mustard"],['audio_mustard_plus.wav',"sarcasm_mustard_plus"],['audio_voxceleb1.wav',"gender_voxceleb1"],['audio_esc50.wav',"audio_classification_esc50"],['audio_stop.wav',"semantic_parsing_stop"],['audio_freesound.wav',"vad_freesound"]]
332
 
333
  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
334
  gr.Interface(
335
  inference,
336
- [gr.Audio(label="input audio",sources=["microphone"],type="filepath"),gr.Radio(choices=["english_slurp","english_fsc","dutch_scr","english_scr","lithuanian_scr","arabic_scr","english_snips","lid_voxforge","fake_speech_detection_asvspoof","emotion_rec_iemocap","accent_classify_accentdb","sarcasm_mustard","sarcasm_mustard_plus","gender_voxceleb1","audio_classification_esc50","semantic_parsing_stop","vad_freesound"], type="value", label="Task")],
337
  gr.Textbox(type="text", label="Output"),
338
  title=title,
339
  cache_examples=False,
 
26
  speech, rate = soundfile.read(wav)
27
  if len(speech.shape)==2:
28
  speech=speech[:,0]
29
+ if data == "English intent classification and named entity recognition task based on the SLURP database":
30
  speech2text = Speech2Text.from_pretrained(
31
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
32
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
52
  slot_val=k.split(" FILL ")[1]
53
  text=text+" "+slot_name+" : "+slot_val+","
54
  text=text+"}"
55
+ elif data == "English intent classification task based on the FSC database":
56
  speech2text = Speech2Text.from_pretrained(
57
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
58
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
71
  objects=intent.split("_")[1]
72
  location=intent.split("_")[2]
73
  text="INTENT: {action: "+action+", object: "+objects+", location: "+location+"}"
74
+ elif data == "English intent classification task based on the SNIPS database":
75
  speech2text = Speech2Text.from_pretrained(
76
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
77
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
87
  text=text.split("|>")[-1]
88
  intent=text.split(" ")[0].replace("in:","")
89
  text="INTENT: "+intent
90
+ elif data == "Dutch speech command recognition task based on the Grabo database":
91
  speech2text = Speech2Text.from_pretrained(
92
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
93
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
103
  text=text.split("|>")[-1]
104
  intent=text.split(" ")[0]
105
  text="SPEECH COMMAND: "+intent
106
+ elif data == "English speech command recognition task based on the Google Speech Commands database":
107
  speech2text = Speech2Text.from_pretrained(
108
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
109
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
119
  text=text.split("|>")[-1]
120
  intent=text.split(" ")[0].replace("command:","")
121
  text="SPEECH COMMAND: "+intent
122
+ elif data == "Lithuanian speech command recognition task based on the Lithuanian SC database":
123
  speech2text = Speech2Text.from_pretrained(
124
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
125
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
135
  text=text.split("|>")[-1]
136
  intent=text
137
  text="SPEECH COMMAND: "+intent
138
+ elif data == "Arabic speech command recognition task based on the Arabic SC database":
139
  speech2text = Speech2Text.from_pretrained(
140
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
141
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
151
  text=text.split("|>")[-1]
152
  intent=text.split(" ")[0].replace("command:","")
153
  text="SPEECH COMMAND: "+intent
154
+ elif data == "Language Identification task based on the VoxForge database":
155
  speech2text = Speech2Text.from_pretrained(
156
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
157
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
166
  # import pdb;pdb.set_trace()
167
  lang=speech2text.converter.tokenizer.tokenizer.convert_ids_to_tokens(nbests[0][2][0]).replace("|>","").replace("<|","")
168
  text="LANG: "+lang
169
+ elif data == "English Fake Speech Detection task based on the ASVSpoof database":
170
  speech2text = Speech2Text.from_pretrained(
171
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
172
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
182
  text=text.split("|>")[-1]
183
  intent=text.split(" ")[0].replace("class:","")
184
  text="SPEECH CLASS: "+intent
185
+ elif data == "English emotion recognition task based on the IEMOCAP database":
186
  replace_dict={}
187
  replace_dict["em:neu"]="Neutral"
188
  replace_dict["em:ang"]="Angry"
 
203
  text=text.split("|>")[-1]
204
  intent=replace_dict[text.split(" ")[0]]
205
  text="EMOTION: "+intent
206
+ elif data == "English accent classification task based on the Accent DB database":
207
  speech2text = Speech2Text.from_pretrained(
208
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
209
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
219
  text=text.split("|>")[-1]
220
  intent=text.split(" ")[0].replace("accent:","")
221
  text="ACCENT: "+intent
222
+ elif data == "English sarcasm detection task based on the MUStARD database":
223
  speech2text = Speech2Text.from_pretrained(
224
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
225
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
235
  text=text.split("|>")[-1]
236
  intent=text.split(" ")[0].replace("class:","")
237
  text="SARCASM CLASS: "+intent
238
+ elif data == "English sarcasm detection task based on the MUStARD++ database":
239
  speech2text = Speech2Text.from_pretrained(
240
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
241
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
251
  text=text.split("|>")[-1]
252
  intent=text.split(" ")[0].replace("class:","")
253
  text="SARCASM CLASS: "+intent
254
+ elif data == "English gender identification task based on the VoxCeleb1 database":
255
  speech2text = Speech2Text.from_pretrained(
256
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
257
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
267
  text=text.split("|>")[-1]
268
  intent=text.split(" ")[0].replace("gender:f","female").replace("gender:m","male")
269
  text="GENDER: "+intent
270
+ elif data == "Audio classification task based on the ESC-50 database":
271
  speech2text = Speech2Text.from_pretrained(
272
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
273
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
283
  text=text.split("|>")[-1]
284
  intent=text.split(" ")[0].replace("audio_class:","")
285
  text="AUDIO EVENT CLASS: "+audio_class_arr[int(intent)]
286
+ elif data == "English semantic parsing task based on the STOP database":
287
  speech2text = Speech2Text.from_pretrained(
288
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
289
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
299
  text, *_ = nbests[0]
300
  text=text.split("|>")[-1].replace("_STOP","")
301
  text="SEMANTIC PARSE SEQUENCE: "+text
302
+ elif data == "Voice activity detection task based on the Google Speech Commands v2 and Freesound database":
303
  speech2text = Speech2Text.from_pretrained(
304
  asr_train_config="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/config.yaml",
305
  asr_model_file="UniverSLU-17-Task-Specifier/exp/asr_train_asr_whisper_full_correct_specaug2_copy_raw_en_whisper_multilingual/valid.acc.ave_10best.pth",
 
328
  description = "Gradio demo for UniverSLU Task Specifier (https://huggingface.co/espnet/UniverSLU-17-Task-Specifier). UniverSLU-17 Task Specifier is a Multi-task Spoken Language Understanding model from CMU WAVLab. It adapts Whisper to additional tasks using single-token task specifiers. To use it, simply record your audio or click one of the examples to load them. More details about the SLU tasks that the model is trained on and it's performance on these tasks can be found in our paper: https://aclanthology.org/2024.naacl-long.151/"
329
  article = "<p style='text-align: center'><a href='https://github.com/espnet/espnet' target='_blank'>Github Repo</a></p>"
330
 
331
+ examples=[['audio_slurp_ner.flac',"English intent classification and named entity recognition task based on the SLURP database"],['audio_fsc.wav',"English intent classification task based on the FSC database"],['audio_grabo.wav',"Dutch speech command recognition task based on the Grabo database"],['audio_english_scr.wav',"English speech command recognition task based on the Google Speech Commands database"],['audio_lt_scr.wav',"Lithuanian speech command recognition task based on the Lithuanian SC database"],['audio_ar_scr.wav',"Arabic speech command recognition task based on the Arabic SC database"],['audio_snips.wav',"English intent classification task based on the SNIPS database"],['audio_lid.wav',"Language Identification task based on the VoxForge database"],['audio_fsd.wav',"English Fake Speech Detection task based on the ASVSpoof database"],['audio_er.wav',"English emotion recognition task based on the IEMOCAP database"],['audio_acc.wav',"English accent classification task based on the Accent DB database"],['audio_mustard.wav',"English sarcasm detection task based on the MUStARD database"],['audio_mustard_plus.wav',"English sarcasm detection task based on the MUStARD++ database"],['audio_voxceleb1.wav',"English gender identification task based on the VoxCeleb1 database"],['audio_esc50.wav',"Audio classification task based on the ESC-50 database"],['audio_stop.wav',"English semantic parsing task based on the STOP database"],['audio_freesound.wav',"Voice activity detection task based on the Google Speech Commands v2 and Freesound database"]]
332
 
333
  # gr.inputs.Textbox(label="input text",lines=10),gr.inputs.Radio(choices=["english"], type="value", default="english", label="language")
334
  gr.Interface(
335
  inference,
336
+ [gr.Audio(label="input audio",sources=["microphone"],type="filepath"),gr.Radio(choices=["English intent classification and named entity recognition task based on the SLURP database","English intent classification task based on the FSC database","Dutch speech command recognition task based on the Grabo database","English speech command recognition task based on the Google Speech Commands database","Lithuanian speech command recognition task based on the Lithuanian SC database","Arabic speech command recognition task based on the Arabic SC database","English intent classification task based on the SNIPS database","Language Identification task based on the VoxForge database","English Fake Speech Detection task based on the ASVSpoof database","English emotion recognition task based on the IEMOCAP database","English accent classification task based on the Accent DB database","English sarcasm detection task based on the MUStARD database","English sarcasm detection task based on the MUStARD++ database","English gender identification task based on the VoxCeleb1 database","Audio classification task based on the ESC-50 database","English semantic parsing task based on the STOP database","Voice activity detection task based on the Google Speech Commands v2 and Freesound database"], type="value", label="Task")],
337
  gr.Textbox(type="text", label="Output"),
338
  title=title,
339
  cache_examples=False,