binwang committed on
Commit f7d283c · verified · 1 Parent(s): fa39fcd

Upload folder using huggingface_hub

Files changed (4)
  1. app/content.py +145 -132
  2. app/draw_diagram.py +22 -12
  3. app/pages.py +184 -55
  4. app/summarization.py +1 -4
app/content.py CHANGED
@@ -1,143 +1,156 @@
 
- dataname_column_rename_in_table = {
- 'librispeech_test_clean' : 'LibriSpeech-Clean',
- 'librispeech_test_other' : 'LibriSpeech-Other',
- 'common_voice_15_en_test' : 'CommonVoice-15-EN',
- 'peoples_speech_test' : 'Peoples-Speech',
- 'gigaspeech_test' : 'GigaSpeech-1',
- 'earnings21_test' : 'Earnings-21',
- 'earnings22_test' : 'Earnings-22',
- 'tedlium3_test' : 'TED-LIUM-3',
- 'tedlium3_long_form_test' : 'TED-LIUM-3-Long',
- 'aishell_asr_zh_test' : 'Aishell-ASR-ZH',
- 'covost2_en_id_test' : 'CoVoST2-EN-ID',
- 'covost2_en_zh_test' : 'CoVoST2-EN-ZH',
- 'covost2_en_ta_test' : 'CoVoST2-EN-TA',
- 'covost2_id_en_test' : 'CoVoST2-ID-EN',
- 'covost2_zh_en_test' : 'CoVoST2-ZH-EN',
- 'covost2_ta_en_test' : 'CoVoST2-TA-EN',
- 'cn_college_listen_mcq_test' : 'CN-College-Listen-MCQ',
- 'dream_tts_mcq_test' : 'DREAM-TTS-MCQ',
- 'slue_p2_sqa5_test' : 'SLUE-P2-SQA5',
- 'public_sg_speech_qa_test' : 'Public-SG-Speech-QA',
- 'spoken_squad_test' : 'Spoken-SQuAD',
- 'openhermes_audio_test' : 'OpenHermes-Audio',
- 'alpaca_audio_test' : 'ALPACA-Audio',
- 'wavcaps_test' : 'WavCaps',
- 'audiocaps_test' : 'AudioCaps',
- 'clotho_aqa_test' : 'Clotho-AQA',
- 'wavcaps_qa_test' : 'WavCaps-QA',
- 'audiocaps_qa_test' : 'AudioCaps-QA',
- 'voxceleb_accent_test' : 'VoxCeleb-Accent',
- 'voxceleb_gender_test' : 'VoxCeleb-Gender',
- 'iemocap_gender_test' : 'IEMOCAP-Gender',
- 'iemocap_emotion_test' : 'IEMOCAP-Emotion',
- 'meld_sentiment_test' : 'MELD-Sentiment',
- 'meld_emotion_test' : 'MELD-Emotion',
- 'imda_part1_asr_test' : 'IMDA-Part1-ASR',
- 'imda_part2_asr_test' : 'IMDA-Part2-ASR',
- 'imda_part3_30s_asr_test' : 'IMDA-Part3-30s-ASR',
- 'imda_part4_30s_asr_test' : 'IMDA-Part4-30s-ASR',
- 'imda_part5_30s_asr_test' : 'IMDA-Part5-30s-ASR',
- 'imda_part6_30s_asr_test' : 'IMDA-Part6-30s-ASR',
- 'muchomusic_test' : 'MuChoMusic',
- 'imda_part3_30s_sqa_human_test': 'MNSC-PART3-SQA',
- 'imda_part4_30s_sqa_human_test': 'MNSC-PART4-SQA',
- 'imda_part5_30s_sqa_human_test': 'MNSC-PART5-SQA',
- 'imda_part6_30s_sqa_human_test': 'MNSC-PART6-SQA',
-
-
- }
-
- asr_datsets = {'LibriSpeech-Test-Clean': 'A clean, high-quality testset of the LibriSpeech dataset, used for ASR testing.',
- 'LibriSpeech-Test-Other' : 'A more challenging, noisier testset of the LibriSpeech dataset for ASR testing.',
- 'Common-Voice-15-En-Test': 'Test set from the Common Voice project, which is a crowd-sourced, multilingual speech dataset.',
- 'Peoples-Speech-Test' : 'A large-scale, open-source speech recognition dataset, with diverse accents and domains.',
- 'GigaSpeech-Test' : 'A large-scale ASR dataset with diverse audio sources like podcasts, interviews, etc.',
- 'Earnings21-Test' : 'ASR test dataset focused on earnings calls from 2021, with professional speech and financial jargon.',
- 'Earnings22-Test' : 'Similar to Earnings21, but covering earnings calls from 2022.',
- 'Tedlium3-Test' : 'A test set derived from TED talks, covering diverse speakers and topics.',
- 'Tedlium3-Long-form-Test': 'A longer version of the TED-LIUM dataset, containing extended audio samples. This poses challenges to existing fusion methods in handling long audios. However, it provides benchmark for future development.',
- }
-
- singlish_asr_datasets = {
- 'IMDA-Part1-ASR-Test' : 'Speech recognition test data from the IMDA NSC project, Part 1.',
- 'IMDA-Part2-ASR-Test' : 'Speech recognition test data from the IMDA NSC project, Part 2.',
- 'IMDA-Part3-30s-ASR-Test': 'Speech recognition test data from the IMDA NSC project, Part 3.',
- 'IMDA-Part4-30s-ASR-Test': 'Speech recognition test data from the IMDA NSC project, Part 4.',
- 'IMDA-Part5-30s-ASR-Test': 'Speech recognition test data from the IMDA NSC project, Part 5.',
- 'IMDA-Part6-30s-ASR-Test': 'Speech recognition test data from the IMDA NSC project, Part 6.'
- }
-
- sqa_datasets = {'CN-College-Listen-MCQ-Test': 'Chinese College English Listening Test, with multiple-choice questions.',
- 'DREAM-TTS-MCQ-Test' : 'DREAM dataset for spoken question-answering, derived from textual data and synthesized speech.',
- 'SLUE-P2-SQA5-Test' : 'Spoken Language Understanding Evaluation (SLUE) dataset, part 2, focused on QA tasks.',
- 'Public-SG-Speech-QA-Test': 'Public dataset for speech-based question answering, gathered from Singapore.',
- 'Spoken-Squad-Test' : 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.'
- }
-
- sqa_singlish_datasets = {
- 'MNSC-PART3-SQA': 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 3.',
- 'MNSC-PART4-SQA': 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 4.',
- 'MNSC-PART5-SQA': 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 5.',
- 'MNSC-PART6-SQA': 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 6.',
- }
-
- si_datasets = {
- 'OpenHermes-Audio-Test': 'Test set for spoken instructions. Synthesized from the OpenHermes dataset.',
- 'ALPACA-Audio-Test' : 'Spoken version of the ALPACA dataset, used for evaluating instruction following in audio.'
- }
-
- ac_datasets = {
- 'WavCaps-Test' : 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
- 'AudioCaps-Test': 'AudioCaps dataset, used for generating captions from general audio events.'
- }
-
- asqa_datasets = {
- 'Clotho-AQA-Test' : 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
- 'WavCaps-QA-Test' : 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
- 'AudioCaps-QA-Test': 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.'
- }
 
- er_datasets = {
- 'IEMOCAP-Emotion-Test': 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
- 'MELD-Sentiment-Test' : 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
- 'MELD-Emotion-Test' : 'Emotion classification in speech using MELD, detecting specific emotions like happiness, anger, etc.'
  }
 
- ar_datsets = {
- 'VoxCeleb-Accent-Test': 'Test dataset for accent recognition, based on VoxCeleb, a large speaker identification dataset.'
- }
-
- gr_datasets = {
- 'VoxCeleb-Gender-Test': 'Test dataset for gender classification, also derived from VoxCeleb.',
- 'IEMOCAP-Gender-Test' : 'Gender classification based on the IEMOCAP dataset.'
- }
 
- spt_datasets = {
- 'CoVoST2-EN-ID-test': 'CoVoST 2 dataset for speech translation from English to Indonesian.',
- 'CoVoST2-EN-ZH-test': 'CoVoST 2 dataset for speech translation from English to Chinese.',
- 'CoVoST2-EN-TA-test': 'CoVoST 2 dataset for speech translation from English to Tamil.',
- 'CoVoST2-ID-EN-test': 'CoVoST 2 dataset for speech translation from Indonesian to English.',
- 'CoVoST2-ZH-EN-test': 'CoVoST 2 dataset for speech translation from Chinese to English.',
- 'CoVoST2-TA-EN-test': 'CoVoST 2 dataset for speech translation from Tamil to English.'
- }
 
- cnasr_datasets = {
- 'Aishell-ASR-ZH-Test': 'ASR test dataset for Mandarin Chinese, based on the Aishell dataset.'
- }
 
- MUSIC_MCQ_DATASETS = {
- 'MuChoMusic-Test': 'Test dataset for music understanding, from paper: MuChoMusic: Evaluating Music Understanding in Multimodal Audio-Language Models.'
- }
 
- metrics = {
- 'wer' : 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
- 'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
- 'llama3_70b_judge' : 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
- 'meteor' : 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
- 'bleu' : 'BLEU (Bilingual Evaluation Understudy), another text generation evaluation metric commonly used in machine translation. (Sensitive to output length)',
- }
 
  metrics_info = {
  'wer' : 'Word Error Rate (WER) - The Lower, the better.',
 
+ displayname2datasetname = {
+ 'LibriSpeech-Clean' : 'librispeech_test_clean',
+ 'LibriSpeech-Other' : 'librispeech_test_other',
+ 'CommonVoice-15-EN' : 'common_voice_15_en_test',
+ 'Peoples-Speech' : 'peoples_speech_test',
+ 'GigaSpeech-1' : 'gigaspeech_test',
+ 'Earnings-21' : 'earnings21_test',
+ 'Earnings-22' : 'earnings22_test',
+ 'TED-LIUM-3' : 'tedlium3_test',
+ 'TED-LIUM-3-LongForm' : 'tedlium3_long_form_test',
+ 'AISHELL-ASR-ZH' : 'aishell_asr_zh_test',
+ 'CoVoST2-EN-ID' : 'covost2_en_id_test',
+ 'CoVoST2-EN-ZH' : 'covost2_en_zh_test',
+ 'CoVoST2-EN-TA' : 'covost2_en_ta_test',
+ 'CoVoST2-ID-EN' : 'covost2_id_en_test',
+ 'CoVoST2-ZH-EN' : 'covost2_zh_en_test',
+ 'CoVoST2-TA-EN' : 'covost2_ta_en_test',
+ 'CN-College-Listen-MCQ': 'cn_college_listen_mcq_test',
+ 'DREAM-TTS-MCQ' : 'dream_tts_mcq_test',
+ 'SLUE-P2-SQA5' : 'slue_p2_sqa5_test',
+ 'Public-SG-Speech-QA' : 'public_sg_speech_qa_test',
+ 'Spoken-SQuAD' : 'spoken_squad_test',
+ 'OpenHermes-Audio' : 'openhermes_audio_test',
+ 'ALPACA-Audio' : 'alpaca_audio_test',
+ 'WavCaps' : 'wavcaps_test',
+ 'AudioCaps' : 'audiocaps_test',
+ 'Clotho-AQA' : 'clotho_aqa_test',
+ 'WavCaps-QA' : 'wavcaps_qa_test',
+ 'AudioCaps-QA' : 'audiocaps_qa_test',
+ 'VoxCeleb-Accent' : 'voxceleb_accent_test',
+ 'MNSC-AR-Sentence' : 'imda_ar_sentence',
+ 'MNSC-AR-Dialogue' : 'imda_ar_dialogue',
+ 'VoxCeleb-Gender' : 'voxceleb_gender_test',
+ 'IEMOCAP-Gender' : 'iemocap_gender_test',
+ 'IEMOCAP-Emotion' : 'iemocap_emotion_test',
+ 'MELD-Sentiment' : 'meld_sentiment_test',
+ 'MELD-Emotion' : 'meld_emotion_test',
+ 'MuChoMusic' : 'muchomusic_test',
+ 'MNSC-PART1-ASR' : 'imda_part1_asr_test',
+ 'MNSC-PART2-ASR' : 'imda_part2_asr_test',
+ 'MNSC-PART3-ASR' : 'imda_part3_30s_asr_test',
+ 'MNSC-PART4-ASR' : 'imda_part4_30s_asr_test',
+ 'MNSC-PART5-ASR' : 'imda_part5_30s_asr_test',
+ 'MNSC-PART6-ASR' : 'imda_part6_30s_asr_test',
+ 'MNSC-PART3-SQA' : 'imda_part3_30s_sqa_human_test',
+ 'MNSC-PART4-SQA' : 'imda_part4_30s_sqa_human_test',
+ 'MNSC-PART5-SQA' : 'imda_part5_30s_sqa_human_test',
+ 'MNSC-PART6-SQA' : 'imda_part6_30s_sqa_human_test',
+ 'MNSC-PART3-SDS' : 'imda_part3_30s_ds_human_test',
+ 'MNSC-PART4-SDS' : 'imda_part4_30s_ds_human_test',
+ 'MNSC-PART5-SDS' : 'imda_part5_30s_ds_human_test',
+ 'MNSC-PART6-SDS' : 'imda_part6_30s_ds_human_test',
+
+ 'CNA' : 'cna_test',
+ 'IDPC' : 'idpc_test',
+ 'Parliament' : 'parliament_test',
+ 'UKUS-News' : 'ukusnews_test',
+ 'Mediacorp' : 'mediacorp_test',
+ 'IDPC-Short' : 'idpc_short_test',
+ 'Parliament-Short': 'parliament_short_test',
+ 'UKUS-News-Short' : 'ukusnews_short_test',
+ 'Mediacorp-Short' : 'mediacorp_short_test',
+ 'YTB-ASR-Batch1' : 'ytb_asr_batch1',
+ 'YTB-ASR-Batch2' : 'ytb_asr_batch2',
+ 'SEAME-Dev-Man' : 'seame_dev_man',
+ 'SEAME-Dev-Sge' : 'seame_dev_sge',
+
+ 'YTB-SQA-Batch1': 'ytb_sqa_batch1',
+ 'YTB-SDS-Batch1': 'ytb_sds_batch1',
+ 'YTB-PQA-Batch1': 'ytb_pqa_batch1',
 
  }
 
+ datasetname2diaplayname = {datasetname: displayname for displayname, datasetname in displayname2datasetname.items()}
+
+
+ dataset_diaplay_information = {
+ 'LibriSpeech-Clean' : 'A clean, high-quality testset of the LibriSpeech dataset, used for ASR testing.',
+ 'LibriSpeech-Other' : 'A more challenging, noisier testset of the LibriSpeech dataset for ASR testing.',
+ 'CommonVoice-15-EN' : 'Test set from the Common Voice project, which is a crowd-sourced, multilingual speech dataset.',
+ 'Peoples-Speech' : 'A large-scale, open-source speech recognition dataset, with diverse accents and domains.',
+ 'GigaSpeech-1' : 'A large-scale ASR dataset with diverse audio sources like podcasts, interviews, etc.',
+ 'Earnings-21' : 'ASR test dataset focused on earnings calls from 2021, with professional speech and financial jargon.',
+ 'Earnings-22' : 'Similar to Earnings21, but covering earnings calls from 2022.',
+ 'TED-LIUM-3' : 'A test set derived from TED talks, covering diverse speakers and topics.',
+ 'TED-LIUM-3-LongForm' : 'A longer version of the TED-LIUM dataset, containing extended audio samples. This poses challenges to existing fusion methods in handling long audios. However, it provides benchmark for future development.',
+ 'AISHELL-ASR-ZH' : 'ASR test dataset for Mandarin Chinese, based on the Aishell dataset.',
+ 'CoVoST2-EN-ID' : 'CoVoST 2 dataset for speech translation from English to Indonesian.',
+ 'CoVoST2-EN-ZH' : 'CoVoST 2 dataset for speech translation from English to Chinese.',
+ 'CoVoST2-EN-TA' : 'CoVoST 2 dataset for speech translation from English to Tamil.',
+ 'CoVoST2-ID-EN' : 'CoVoST 2 dataset for speech translation from Indonesian to English.',
+ 'CoVoST2-ZH-EN' : 'CoVoST 2 dataset for speech translation from Chinese to English.',
+ 'CoVoST2-TA-EN' : 'CoVoST 2 dataset for speech translation from Tamil to English.',
+ 'CN-College-Listen-MCQ': 'Chinese College English Listening Test, with multiple-choice questions.',
+ 'DREAM-TTS-MCQ' : 'DREAM dataset for spoken question-answering, derived from textual data and synthesized speech.',
+ 'SLUE-P2-SQA5' : 'Spoken Language Understanding Evaluation (SLUE) dataset, part 2, focused on QA tasks.',
+ 'Public-SG-Speech-QA' : 'Public dataset for speech-based question answering, gathered from Singapore.',
+ 'Spoken-SQuAD' : 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.',
+ 'OpenHermes-Audio' : 'Test set for spoken instructions. Synthesized from the OpenHermes dataset.',
+ 'ALPACA-Audio' : 'Spoken version of the ALPACA dataset, used for evaluating instruction following in audio.',
+ 'WavCaps' : 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
+ 'AudioCaps' : 'AudioCaps dataset, used for generating captions from general audio events.',
+ 'Clotho-AQA' : 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
+ 'WavCaps-QA' : 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
+ 'AudioCaps-QA' : 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.',
+ 'VoxCeleb-Accent' : 'Test dataset for accent recognition, based on VoxCeleb, a large speaker identification dataset.',
+ 'MNSC-AR-Sentence' : 'Accent recognition based on the IMDA NSC dataset, focusing on sentence-level accents.',
+ 'MNSC-AR-Dialogue' : 'Accent recognition based on the IMDA NSC dataset, focusing on dialogue-level accents.',
+
+ 'VoxCeleb-Gender': 'Test dataset for gender classification, also derived from VoxCeleb.',
+ 'IEMOCAP-Gender' : 'Gender classification based on the IEMOCAP dataset.',
+ 'IEMOCAP-Emotion': 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
+ 'MELD-Sentiment' : 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
+ 'MELD-Emotion' : 'Emotion classification in speech using MELD, detecting specific emotions like happiness, anger, etc.',
+ 'MuChoMusic' : 'Test dataset for music understanding, from paper: MuChoMusic: Evaluating Music Understanding in Multimodal Audio-Language Models.',
+ 'MNSC-PART1-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 1.',
+ 'MNSC-PART2-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 2.',
+ 'MNSC-PART3-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 3.',
+ 'MNSC-PART4-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 4.',
+ 'MNSC-PART5-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 5.',
+ 'MNSC-PART6-ASR' : 'Speech recognition test data from the IMDA NSC project, Part 6.',
+ 'MNSC-PART3-SQA' : 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 3.',
+ 'MNSC-PART4-SQA' : 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 4.',
+ 'MNSC-PART5-SQA' : 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 5.',
+ 'MNSC-PART6-SQA' : 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 6.',
+ 'MNSC-PART3-SDS' : 'Multitak National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 3.',
+ 'MNSC-PART4-SDS' : 'Multitak National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 4.',
+ 'MNSC-PART5-SDS' : 'Multitak National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 5.',
+ 'MNSC-PART6-SDS' : 'Multitak National Speech Corpus (MNSC) dataset, dialogue summarization task, Part 6.',
+
+ 'CNA' : 'Under Development',
+ 'IDPC' : 'Under Development',
+ 'Parliament' : 'Under Development',
+ 'UKUS-News' : 'Under Development',
+ 'Mediacorp' : 'Under Development',
+ 'IDPC-Short' : 'Under Development',
+ 'Parliament-Short': 'Under Development',
+ 'UKUS-News-Short' : 'Under Development',
+ 'Mediacorp-Short' : 'Under Development',
+ 'YTB-ASR-Batch1' : 'Under Development',
+ 'YTB-ASR-Batch2' : 'Under Development',
+ 'SEAME-Dev-Man' : 'Under Development',
+ 'SEAME-Dev-Sge' : 'Under Development',
+
+ 'YTB-SQA-Batch1': 'Under Development',
+ 'YTB-SDS-Batch1': 'Under Development',
+ 'YTB-PQA-Batch1': 'Under Development',
 
+ }
 
  metrics_info = {
  'wer' : 'Word Error Rate (WER) - The Lower, the better.',
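The intent of the new mappings above can be shown with a minimal, illustrative sketch (not part of the commit; the DataFrame below is made-up data): a leaderboard display name resolves to the raw results-CSV column name via `displayname2datasetname`, and `datasetname2diaplayname` renames columns back for display.

```python
# Minimal sketch (illustrative only): round-tripping between display names and
# the raw dataset/column names used in the results CSVs.
import pandas as pd

from app.content import displayname2datasetname, datasetname2diaplayname

display_name = 'MNSC-PART3-SQA'
dataset_name = displayname2datasetname[display_name]     # 'imda_part3_30s_sqa_human_test'

# A stand-in results table; the real CSVs keep raw dataset names as columns.
scores = pd.DataFrame({'Model': ['some_model'], dataset_name: [0.78]})
scores = scores.rename(columns=datasetname2diaplayname)  # raw names -> display names
assert display_name in scores.columns
```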
app/draw_diagram.py CHANGED
@@ -2,32 +2,29 @@ import streamlit as st
  import pandas as pd
  import numpy as np
  from streamlit_echarts import st_echarts
- from streamlit.components.v1 import html
- # from PIL import Image
  from app.show_examples import *
  from app.content import *
  import pandas as pd
 
  from model_information import get_dataframe
-
-
-
  info_df = get_dataframe()
 
 
- def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
 
  folder = f"./results_organized/{metrics}/"
 
  # Load the results from CSV
  data_path = f'{folder}/{category_name.lower()}.csv'
  chart_data = pd.read_csv(data_path).round(3)
- new_dataset_name = dataset_name.replace('-', '_').lower()
- chart_data = chart_data[['Model', new_dataset_name]]
 
  # Rename to proper display name
- new_dataset_name = dataname_column_rename_in_table[new_dataset_name]
- chart_data = chart_data.rename(columns=dataname_column_rename_in_table)
 
  st.markdown("""
  <style>
@@ -52,7 +49,7 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
  )
 
  chart_data = chart_data[chart_data['model_show'].isin(models)]
- chart_data = chart_data.sort_values(by=[new_dataset_name], ascending=cus_sort).dropna(axis=0)
 
  if len(chart_data) == 0: return
 
@@ -103,6 +100,19 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
  'IMDA-Part4-30s-ASR',
  'IMDA-Part5-30s-ASR',
  'IMDA-Part6-30s-ASR',
  ]:
 
  chart_data_table = chart_data_table.sort_values(
@@ -203,7 +213,7 @@ def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
  "series": [{
  "name": f"{dataset_name}",
  "type": "bar",
- "data": chart_data[f'{new_dataset_name}'].tolist(),
  }],
  }
 
  import pandas as pd
  import numpy as np
  from streamlit_echarts import st_echarts
  from app.show_examples import *
  from app.content import *
+
  import pandas as pd
 
  from model_information import get_dataframe
  info_df = get_dataframe()
 
 
+ def draw(folder_name, category_name, displayname, metrics, cus_sort=True):
 
  folder = f"./results_organized/{metrics}/"
 
  # Load the results from CSV
  data_path = f'{folder}/{category_name.lower()}.csv'
  chart_data = pd.read_csv(data_path).round(3)
+
+ dataset_name = displayname2datasetname[displayname]
+ chart_data = chart_data[['Model', dataset_name]]
 
  # Rename to proper display name
+ chart_data = chart_data.rename(columns=datasetname2diaplayname)
+
 
  st.markdown("""
  <style>
 
  )
 
  chart_data = chart_data[chart_data['model_show'].isin(models)]
+ chart_data = chart_data.sort_values(by=[displayname], ascending=cus_sort).dropna(axis=0)
 
  if len(chart_data) == 0: return
 
  'IMDA-Part4-30s-ASR',
  'IMDA-Part5-30s-ASR',
  'IMDA-Part6-30s-ASR',
+ 'CNA',
+ 'IDPC',
+ 'Parliament',
+ 'UKUS-News',
+ 'Mediacorp',
+ 'IDPC-Short',
+ 'Parliament-Short',
+ 'UKUS-News-Short',
+ 'Mediacorp-Short',
+ 'YTB-ASR-Batch1',
+ 'YTB-ASR-Batch2',
+ 'SEAME-Dev-Man',
+ 'SEAME-Dev-Sge',
  ]:
 
  chart_data_table = chart_data_table.sort_values(
 
  "series": [{
  "name": f"{dataset_name}",
  "type": "bar",
+ "data": chart_data[f'{displayname}'].tolist(),
  }],
  }
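For reference, a hedged sketch of the updated call pattern: callers now pass the display name and `draw()` resolves the raw CSV column itself via `displayname2datasetname`. The folder, category, and metric values mirror those used in app/pages.py; actually running this assumes a Streamlit session and the results_organized/ CSVs are present.

```python
# Sketch of the new draw() signature in use (assumes a Streamlit runtime and
# that ./results_organized/wer/asr_singlish.csv exists).
from app.draw_diagram import draw

# Display name in; the raw column name is resolved internally.
draw('su', 'asr_singlish', 'MNSC-PART1-ASR', 'wer')
```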
 
app/pages.py CHANGED
@@ -75,7 +75,7 @@ def dashboard():
 
  st.divider()
  with st.container():
- left_co, right_co = st.columns([1, 0.7])
 
  with left_co:
  st.markdown("""
@@ -88,25 +88,52 @@ def dashboard():
  year={2024}
  }
  ```
  """)
 
 
 
 
  def asr_english():
  st.title("Task: Automatic Speech Recognition - English")
 
  sum = ['Overall']
  dataset_lists = [
- 'LibriSpeech-Test-Clean',
- 'LibriSpeech-Test-Other',
- 'Common-Voice-15-En-Test',
- 'Peoples-Speech-Test',
- 'GigaSpeech-Test',
- 'Earnings21-Test',
- 'Earnings22-Test',
- 'Tedlium3-Test',
- 'Tedlium3-Long-form-Test',
  ]
 
  filters_levelone = sum + dataset_lists
@@ -120,7 +147,7 @@ def asr_english():
  if filter_1 in sum:
  sum_table_mulit_metrix('asr_english', ['wer'])
  else:
- dataset_contents(asr_datsets[filter_1], metrics['wer'])
  draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
 
 
@@ -132,12 +159,12 @@ def asr_singlish():
 
  sum = ['Overall']
  dataset_lists = [
- 'IMDA-Part1-ASR-Test',
- 'IMDA-Part2-ASR-Test',
- 'IMDA-Part3-30s-ASR-Test',
- 'IMDA-Part4-30s-ASR-Test',
- 'IMDA-Part5-30s-ASR-Test',
- 'IMDA-Part6-30s-ASR-Test',
  ]
 
  filters_levelone = sum + dataset_lists
@@ -151,7 +178,7 @@ def asr_singlish():
  if filter_1 in sum:
  sum_table_mulit_metrix('asr_singlish', ['wer'])
  else:
- dataset_contents(singlish_asr_datasets[filter_1], metrics['wer'])
  draw('su', 'asr_singlish', filter_1, 'wer')
 
 
@@ -162,7 +189,7 @@ def asr_mandarin():
 
  sum = ['Overall']
  dataset_lists = [
- 'Aishell-ASR-ZH-Test',
  ]
 
  filters_levelone = sum + dataset_lists
@@ -176,7 +203,7 @@ def asr_mandarin():
  if filter_1 in sum:
  sum_table_mulit_metrix('asr_mandarin', ['wer'])
  else:
- dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
  draw('su', 'asr_mandarin', filter_1, 'wer')
 
 
@@ -187,12 +214,12 @@ def speech_translation():
 
  sum = ['Overall']
  dataset_lists = [
- 'CoVoST2-EN-ID-test',
- 'CoVoST2-EN-ZH-test',
- 'CoVoST2-EN-TA-test',
- 'CoVoST2-ID-EN-test',
- 'CoVoST2-ZH-EN-test',
- 'CoVoST2-TA-EN-test']
 
  filters_levelone = sum + dataset_lists
 
@@ -205,7 +232,7 @@ def speech_translation():
  if filter_1 in sum:
  sum_table_mulit_metrix('st', ['bleu'])
  else:
- dataset_contents(spt_datasets[filter_1], metrics['bleu'])
  draw('su', 'ST', filter_1, 'bleu')
 
 
@@ -217,11 +244,11 @@ def speech_question_answering_english():
  sum = ['Overall']
 
  dataset_lists = [
- 'CN-College-Listen-MCQ-Test',
- 'DREAM-TTS-MCQ-Test',
- 'SLUE-P2-SQA5-Test',
- 'Public-SG-Speech-QA-Test',
- 'Spoken-Squad-Test',
  ]
 
  filters_levelone = sum + dataset_lists
@@ -240,7 +267,7 @@ def speech_question_answering_english():
  # draw('su', 'SQA', filter_1, 'llama3_70b_judge')
 
  else:
- dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
  draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
 
 
@@ -271,10 +298,39 @@ def speech_question_answering_singlish():
  sum_table_mulit_metrix('sqa_singlish', ['llama3_70b_judge'])
 
  else:
- dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
  draw('su', 'sqa_singlish', filter_1, 'llama3_70b_judge')
 
 
 
 
  def speech_instruction():
@@ -282,8 +338,8 @@ def speech_instruction():
 
  sum = ['Overall']
 
- dataset_lists = ['OpenHermes-Audio-Test',
- 'ALPACA-Audio-Test',
  ]
 
  filters_levelone = sum + dataset_lists
@@ -297,7 +353,7 @@ def speech_instruction():
  if filter_1 in sum:
  sum_table_mulit_metrix('speech_instruction', ['llama3_70b_judge'])
  else:
- dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
  draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
 
 
@@ -306,8 +362,8 @@ def speech_instruction():
  def audio_captioning():
  st.title("Task: Audio Captioning")
 
- filters_levelone = ['WavCaps-Test',
- 'AudioCaps-Test',
  ]
  filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
 
@@ -319,7 +375,7 @@ def audio_captioning():
  metric = st.selectbox('Metric', filters_leveltwo)
 
  if filter_1 or metric:
- dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
  draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
 
 
@@ -330,9 +386,9 @@ def audio_scene_question_answering():
 
  sum = ['Overall']
 
- dataset_lists = ['Clotho-AQA-Test',
- 'WavCaps-QA-Test',
- 'AudioCaps-QA-Test']
 
  filters_levelone = sum + dataset_lists
 
@@ -345,7 +401,7 @@ def audio_scene_question_answering():
  if filter_1 in sum:
  sum_table_mulit_metrix('audio_scene_question_answering', ['llama3_70b_judge'])
  else:
- dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
  draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
 
 
@@ -357,9 +413,9 @@ def emotion_recognition():
 
  sum = ['Overall']
 
  dataset_lists = [
- 'IEMOCAP-Emotion-Test',
- 'MELD-Sentiment-Test',
- 'MELD-Emotion-Test',
  ]
 
  filters_levelone = sum + dataset_lists
@@ -373,7 +429,7 @@ def emotion_recognition():
  if filter_1 in sum:
  sum_table_mulit_metrix('emotion_recognition', ['llama3_70b_judge'])
  else:
- dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge'])
  draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
 
 
@@ -383,7 +439,11 @@ def accent_recognition():
  st.title("Task: Accent Recognition")
 
  sum = ['Overall']
- dataset_lists = ['VoxCeleb-Accent-Test']
 
 
  filters_levelone = sum + dataset_lists
@@ -398,7 +458,7 @@ def accent_recognition():
  if filter_1 in sum:
  sum_table_mulit_metrix('accent_recognition', ['llama3_70b_judge'])
  else:
- dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
  draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
 
 
@@ -409,8 +469,10 @@ def gender_recognition():
 
  sum = ['Overall']
 
- dataset_lists = ['VoxCeleb-Gender-Test',
- 'IEMOCAP-Gender-Test']
 
  filters_levelone = sum + dataset_lists
 
@@ -423,7 +485,7 @@ def gender_recognition():
  if filter_1 in sum:
  sum_table_mulit_metrix('gender_recognition', ['llama3_70b_judge'])
  else:
- dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge'])
  draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
 
 
@@ -434,7 +496,7 @@ def music_understanding():
 
  sum = ['Overall']
 
- dataset_lists = ['MuChoMusic-Test',
  ]
 
  filters_levelone = sum + dataset_lists
@@ -448,7 +510,7 @@ def music_understanding():
  if filter_1 in sum:
  sum_table_mulit_metrix('music_understanding', ['llama3_70b_judge'])
  else:
- dataset_contents(MUSIC_MCQ_DATASETS[filter_1], metrics['llama3_70b_judge'])
  draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
 
 
@@ -457,3 +519,70 @@ def music_understanding():
 
 
 
 
  st.divider()
  with st.container():
+ left_co, right_co = st.columns([1, 0.1])
 
  with left_co:
  st.markdown("""
 
  year={2024}
  }
  ```
+ ```
+ @article{wang2025advancing,
+ title={Advancing Singlish Understanding: Bridging the Gap with Datasets and Multimodal Models},
+ author={Wang, Bin and Zou, Xunlong and Sun, Shuo and Zhang, Wenyu and He, Yingxu and Liu, Zhuohan and Wei, Chengwei and Chen, Nancy F and Aw, AiTi},
+ journal={arXiv preprint arXiv:2501.01034},
+ year={2025}
+ }
+ ```
+ ```
+ @article{he2024meralion,
+ title={MERaLiON-AudioLLM: Technical Report},
+ author={He, Yingxu and Liu, Zhuohan and Sun, Shuo and Wang, Bin and Zhang, Wenyu and Zou, Xunlong and Chen, Nancy F and Aw, Ai Ti},
+ journal={arXiv preprint arXiv:2412.09818},
+ year={2024}
+ }
+ ```
+ ```
+ @article{zhang2024mowe,
+ title={MoWE-Audio: Multitask AudioLLMs with Mixture of Weak Encoders},
+ author={Zhang, Wenyu and Sun, Shuo and Wang, Bin and Zou, Xunlong and Liu, Zhuohan and He, Yingxu and Lin, Geyu and Chen, Nancy F and Aw, Ai Ti},
+ journal={ICASSP},
+ year={2025}
+ }
+ ```
  """)
 
 
 
+
+
+
  def asr_english():
  st.title("Task: Automatic Speech Recognition - English")
 
  sum = ['Overall']
  dataset_lists = [
+ 'LibriSpeech-Clean',
+ 'LibriSpeech-Other',
+ 'CommonVoice-15-EN',
+ 'Peoples-Speech',
+ 'GigaSpeech-1',
+ 'Earnings-21',
+ 'Earnings-22',
+ 'TED-LIUM-3',
+ 'TED-LIUM-3-LongForm',
  ]
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('asr_english', ['wer'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
  draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
 
 
 
  sum = ['Overall']
  dataset_lists = [
+ 'MNSC-PART1-ASR',
+ 'MNSC-PART2-ASR',
+ 'MNSC-PART3-ASR',
+ 'MNSC-PART4-ASR',
+ 'MNSC-PART5-ASR',
+ 'MNSC-PART6-ASR',
  ]
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('asr_singlish', ['wer'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
  draw('su', 'asr_singlish', filter_1, 'wer')
 
 
 
  sum = ['Overall']
  dataset_lists = [
+ 'AISHELL-ASR-ZH',
  ]
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('asr_mandarin', ['wer'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])
  draw('su', 'asr_mandarin', filter_1, 'wer')
 
 
 
  sum = ['Overall']
  dataset_lists = [
+ 'CoVoST2-EN-ID',
+ 'CoVoST2-EN-ZH',
+ 'CoVoST2-EN-TA',
+ 'CoVoST2-ID-EN',
+ 'CoVoST2-ZH-EN',
+ 'CoVoST2-TA-EN']
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('st', ['bleu'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['bleu'])
  draw('su', 'ST', filter_1, 'bleu')
 
 
  sum = ['Overall']
 
  dataset_lists = [
+ 'CN-College-Listen-MCQ',
+ 'DREAM-TTS-MCQ',
+ 'SLUE-P2-SQA5',
+ 'Public-SG-Speech-QA',
+ 'Spoken-SQuAD',
  ]
 
  filters_levelone = sum + dataset_lists
 
  # draw('su', 'SQA', filter_1, 'llama3_70b_judge')
 
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
  draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
 
 
  sum_table_mulit_metrix('sqa_singlish', ['llama3_70b_judge'])
 
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
  draw('su', 'sqa_singlish', filter_1, 'llama3_70b_judge')
 
 
+ def spoken_dialogue_summarization_singlish():
+ st.title("Task: Spoken Dialogue Summarization - Singlish")
+
+ sum = ['Overall']
+
+ dataset_lists = [
+ 'MNSC-PART3-SDS',
+ 'MNSC-PART4-SDS',
+ 'MNSC-PART5-SDS',
+ 'MNSC-PART6-SDS',
+ ]
+
+
+ filters_levelone = sum + dataset_lists
+
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
+
+ with left:
+ filter_1 = st.selectbox('Dataset', filters_levelone)
+
+ if filter_1:
+ if filter_1 in sum:
+ sum_table_mulit_metrix('sds_singlish', ['llama3_70b_judge'])
+
+ else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
+ draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge')
+
+
 
 
  def speech_instruction():
 
  sum = ['Overall']
 
+ dataset_lists = ['OpenHermes-Audio',
+ 'ALPACA-Audio',
  ]
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('speech_instruction', ['llama3_70b_judge'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
  draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
 
 
  def audio_captioning():
  st.title("Task: Audio Captioning")
 
+ filters_levelone = ['WavCaps',
+ 'AudioCaps',
  ]
  filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
 
  metric = st.selectbox('Metric', filters_leveltwo)
 
  if filter_1 or metric:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info[metric.lower().replace('-', '_')])
  draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
 
 
 
  sum = ['Overall']
 
+ dataset_lists = ['Clotho-AQA',
+ 'WavCaps-QA',
+ 'AudioCaps-QA']
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('audio_scene_question_answering', ['llama3_70b_judge'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
  draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
 
 
  sum = ['Overall']
 
  dataset_lists = [
+ 'IEMOCAP-Emotion',
+ 'MELD-Sentiment',
+ 'MELD-Emotion',
  ]
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('emotion_recognition', ['llama3_70b_judge'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
  draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
 
 
  st.title("Task: Accent Recognition")
 
  sum = ['Overall']
+ dataset_lists = [
+ 'VoxCeleb-Accent',
+ 'MNSC-AR-Sentence',
+ 'MNSC-AR-Dialogue',
+ ]
 
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('accent_recognition', ['llama3_70b_judge'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
  draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
 
 
 
  sum = ['Overall']
 
+ dataset_lists = [
+ 'VoxCeleb-Gender',
+ 'IEMOCAP-Gender'
+ ]
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('gender_recognition', ['llama3_70b_judge'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
  draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
 
 
 
  sum = ['Overall']
 
+ dataset_lists = ['MuChoMusic',
  ]
 
  filters_levelone = sum + dataset_lists
 
  if filter_1 in sum:
  sum_table_mulit_metrix('music_understanding', ['llama3_70b_judge'])
  else:
+ dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge'])
  draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
 
 
 
 
+
+
+
+ def under_development():
+ st.title("Task: Under Development")
+
+
+ dataset_lists = [
+ 'CNA',
+ 'IDPC',
+ 'Parliament',
+ 'UKUS-News',
+ 'Mediacorp',
+ 'IDPC-Short',
+ 'Parliament-Short',
+ 'UKUS-News-Short',
+ 'Mediacorp-Short',
+
+ 'YTB-ASR-Batch1',
+ 'YTB-ASR-Batch2',
+ 'SEAME-Dev-Man',
+ 'SEAME-Dev-Sge',
+
+ 'YTB-SQA-Batch1',
+ 'YTB-SDS-Batch1',
+ 'YTB-PQA-Batch1',
+
+ ]
+
+ filters_levelone = dataset_lists
+
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
+
+ with left:
+ filter_1 = st.selectbox('Dataset', filters_levelone)
+
+ dataset_contents(dataset_diaplay_information[filter_1], 'under_development')
+
+ if filter_1 in [
+ 'CNA',
+ 'IDPC',
+ 'Parliament',
+ 'UKUS-News',
+ 'Mediacorp',
+ 'IDPC-Short',
+ 'Parliament-Short',
+ 'UKUS-News-Short',
+ 'Mediacorp-Short',
+ 'YTB-ASR-Batch1',
+ 'YTB-ASR-Batch2',
+ 'SEAME-Dev-Man',
+ 'SEAME-Dev-Sge',
+ ]:
+
+ draw('vu', 'under_development_wer', filter_1, 'wer')
+
+ elif filter_1 in [
+ 'YTB-SQA-Batch1',
+ 'YTB-SDS-Batch1',
+ 'YTB-PQA-Batch1',
+ ]:
+ draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge')
+
+
+
+
app/summarization.py CHANGED
@@ -14,7 +14,6 @@ from model_information import get_dataframe
 
  info_df = get_dataframe()
 
- metrics_info = metrics_info
 
  def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
 
@@ -34,7 +33,7 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
  chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
 
  # Update dataset name in table
- chart_data = chart_data.rename(columns=dataname_column_rename_in_table)
 
  st.markdown("""
  <style>
@@ -55,11 +54,9 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
  models = st.multiselect("Please choose the model",
  sorted(chart_data['model_show'].tolist()),
  default = sorted(chart_data['model_show'].tolist()),
- # key=f"multiselect_{task_name}_{metrics}"
  )
 
  chart_data = chart_data[chart_data['model_show'].isin(models)].dropna(axis=0)
- # chart_data = chart_data.sort_values(by=['Average'], ascending=True).dropna(axis=0)
 
  if len(chart_data) == 0: return
 
 
  info_df = get_dataframe()
 
 
  def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
 
  chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
 
  # Update dataset name in table
+ chart_data = chart_data.rename(columns=datasetname2diaplayname)
 
  st.markdown("""
  <style>
 
  models = st.multiselect("Please choose the model",
  sorted(chart_data['model_show'].tolist()),
  default = sorted(chart_data['model_show'].tolist()),
  )
 
  chart_data = chart_data[chart_data['model_show'].isin(models)].dropna(axis=0)
 
  if len(chart_data) == 0: return
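To make the renaming step in `sum_table_mulit_metrix` concrete, here is a small self-contained sketch; the frame is fabricated and only the average/rename logic mirrors the code above.

```python
# Hedged sketch of the averaging + display-name renaming done in app/summarization.py.
import pandas as pd

from app.content import datasetname2diaplayname

chart_data = pd.DataFrame({
    'Model': ['model_a', 'model_b'],
    'imda_part1_asr_test': [0.12, 0.15],
    'imda_part2_asr_test': [0.20, 0.18],
})
selected_columns = [c for c in chart_data.columns if c != 'Model']
chart_data['Average'] = chart_data[selected_columns].mean(axis=1)
chart_data = chart_data.rename(columns=datasetname2diaplayname)
print(chart_data.columns.tolist())  # ['Model', 'MNSC-PART1-ASR', 'MNSC-PART2-ASR', 'Average']
```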