KevinGeng committed on
Commit
9b98548
·
1 Parent(s): 3894058

add conversion for INTELLIGIBILITY SCORE and NATURALNESS SCORE

Browse files
.gitignore ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+ __pycache__/
7
+ *.db
8
+ *.sqlite3
9
+ *.sqlite
10
+ *.log
11
+ *.bak
12
+ *.swp
13
+ *.swo
14
+ *.tmp
15
+ *.tmp.*
16
+ *~
17
+
18
+ # flagged
19
+ flagged/
20
+
21
+ # Audio files
22
+ *.wav
app.py CHANGED
@@ -6,6 +6,7 @@ import torch.nn as nn
6
  import lightning_module
7
  import pdb
8
  import jiwer
 
9
 
10
  # ASR part
11
  from transformers import pipeline
@@ -57,6 +58,10 @@ def calc_mos(audio_path, ref):
57
  trans = p(audio_path)["text"]
58
  # WER
59
  wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
 
 
 
 
60
  # MOS
61
  batch = {
62
  'wav': out_wavs,
@@ -66,6 +71,8 @@ def calc_mos(audio_path, ref):
66
  with torch.no_grad():
67
  output = model(batch)
68
  predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
 
 
69
  # Phonemes per minute (PPM)
70
  with torch.no_grad():
71
  logits = phoneme_model(out_wavs).logits
@@ -75,7 +82,7 @@ def calc_mos(audio_path, ref):
75
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
76
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
77
 
78
- return predic_mos, trans, wer, phone_transcription, ppm
79
 
80
 
81
  description ="""
@@ -93,9 +100,9 @@ iface = gr.Interface(
93
  fn=calc_mos,
94
  inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
95
  gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
96
- outputs=[gr.Textbox(placeholder="Naturalness evaluation, ranged 1 to 5, the higher the better.", label="Predicted MOS"),
97
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
98
- gr.Textbox(placeholder="Word Error Rate: Only valid when Reference is given", label = "WER"),
99
  gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
100
  gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
101
  title="Laronix's Voice Quality Checking System Demo",
 
6
  import lightning_module
7
  import pdb
8
  import jiwer
9
+ from local.convert_metrics import nat2avaMOS, WER2INTELI
10
 
11
  # ASR part
12
  from transformers import pipeline
 
58
  trans = p(audio_path)["text"]
59
  # WER
60
  wer = jiwer.wer(ref, trans, truth_transform=transformation, hypothesis_transform=transformation)
61
+
62
+ # Convert WER (as a percentage) to an Intelligibility score
63
+ INTELI_score = WER2INTELI(wer*100)
64
+
65
  # MOS
66
  batch = {
67
  'wav': out_wavs,
 
71
  with torch.no_grad():
72
  output = model(batch)
73
  predic_mos = output.mean(dim=1).squeeze().detach().numpy()*2 + 3
74
+ # MOS to AVA MOS
75
+ AVA_MOS = nat2avaMOS(predic_mos)
76
  # Phonemes per minute (PPM)
77
  with torch.no_grad():
78
  logits = phoneme_model(out_wavs).logits
 
82
  wav_vad = torchaudio.functional.vad(wav, sample_rate=sr)
83
  ppm = len(lst_phonemes) / (wav_vad.shape[-1] / sr) * 60
84
 
85
+ return AVA_MOS, trans, INTELI_score, phone_transcription, ppm
86
 
87
 
88
  description ="""
 
100
  fn=calc_mos,
101
  inputs=[gr.Audio(type='filepath', label="Audio to evaluate"),
102
  gr.Textbox(placeholder="Input reference here (Don't keep this empty)", label="Reference")],
103
+ outputs=[gr.Textbox(placeholder="Naturalness Score, ranged from 0 to 5, the higher the better.", label="Naturalness Score, ranged from 0 to 5, the higher the better."),
104
  gr.Textbox(placeholder="Hypothesis", label="Hypothesis"),
105
+ gr.Textbox(placeholder="Intelligibility Score", label = "Intelligibility Score, range from 0 to 100, the higher the better"),
106
  gr.Textbox(placeholder="Predicted Phonemes", label="Predicted Phonemes"),
107
  gr.Textbox(placeholder="Speaking Rate, Phonemes per minutes", label="PPM")],
108
  title="Laronix's Voice Quality Checking System Demo",
flagged/log.csv ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Audio to evaluate,Reference,flag,username,timestamp
2
+ /mnt/ssdisk/Laronix_voice_quality_checking_system_FILEIO/flagged/Audio to evaluate/733000b063d55f6ef81b9319254d71e0b9f3575a/tmpvs5jv1ic.wav,"Once upon a time, there was a young rat named Arthur who couldn't make up his mind",,,2023-10-23 15:15:26.722712
3
+ /mnt/ssdisk/Laronix_voice_quality_checking_system_FILEIO/flagged/Audio to evaluate/61f172b4530dab3c975a0f17861216d097df5df1/tmp5ihs8ctx.wav,"Once upon a time, there was a young rat named Arthur who couldn't make up his mind",,,2023-10-23 15:18:15.832797
4
+ /mnt/ssdisk/Laronix_voice_quality_checking_system_FILEIO/flagged/Audio to evaluate/82c854b768a5a5350164fe165b63b40c85d59b26/tmpjyg2550a.wav,"Once upon a time, there was a young rat named Arthur who couldn't make up his mind",,,2023-10-23 15:20:15.140430
local/WER2INTELI.png ADDED
local/convert_metrics.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+
4
+ # Natural MOS to AVA MOS
5
+
6
def linear_function(x):
    """Return ``8 * x - 8``, the steep low-end segment of the MOS mapping.

    Maps 1 -> 0 and 1.5 -> 4. Pure arithmetic, so it also works
    element-wise on NumPy arrays.
    """
    return 8 * x - 8


def quadratic_function(x):
    """Return ``-0.0816 * (x - 5) ** 2 + 5``, the upper segment of the mapping.

    A downward-opening parabola with its vertex at (5, 5). At x = 1.5 it
    evaluates to roughly 4.0004, so it meets ``linear_function`` (which gives
    4 there) almost exactly. Also works element-wise on NumPy arrays.
    """
    return -0.0816 * (x - 5) ** 2 + 5


# Natural MOS to AVA MOS
def nat2avaMOS(x):
    """Convert a predicted naturalness MOS to an AVA MOS in the range 0 to 5.

    Piecewise mapping of a scalar ``x``:

    * ``x < 1``        -> 0 (saturated at the bottom of the scale)
    * ``1 <= x <= 1.5`` -> ``linear_function(x)``
    * ``1.5 < x <= 5``  -> ``quadratic_function(x)``
    * ``x > 5``        -> 5 (saturated at the top of the scale)

    Args:
        x: Scalar naturalness MOS (a Python number or a NumPy scalar / 0-d
            array). Multi-element arrays are not supported because the
            branch conditions need a single truth value.

    Returns:
        The converted score. A NaN input falls through to the quadratic
        branch and therefore propagates as NaN.
    """
    # The upstream predictor is not guaranteed to stay inside [1, 5]; without
    # these guards an input above 5 fell off the end of the function (None)
    # and an input below 1 produced a negative score.
    if x > 5:
        return 5.0
    if x < 1:
        return 0.0
    if x <= 1.5:
        return linear_function(x)
    return quadratic_function(x)
18
+
19
+ # Word error rate to Intelligibility Score (x is a percentage)
20
def WER2INTELI(x):
    """Convert a word error rate, given in percent, to an intelligibility score.

    Piecewise mapping of a scalar ``x``:

    * ``x <= 10``        -> 100
    * ``10 < x <= 100``  -> straight line from 100 (at x = 10) down to 30
      (at x = 100)
    * ``x > 100``        -> ``30 * exp(-0.01 * (x - 100))``, decaying from 30
      toward 0

    Args:
        x: Word error rate as a percentage (e.g. 25 for 25 %). Values above
            100 are accepted.

    Returns:
        The intelligibility score, at most 100; higher is better.
    """
    if x <= 10:
        return 100
    if x <= 100:
        slope = (30 - 100) / (100 - 10)
        intercept = 100 - slope * 10
        return slope * x + intercept
    # The tail must start at 30, the value the linear segment reaches at
    # x = 100. Starting it at 100 made the score jump back up to ~99 as soon
    # as the WER exceeded 100 %, ranking worse transcripts above better ones.
    return 30 * np.exp(-0.01 * (x - 100))
29
+
30
+ # # Generate x values
31
+ # x = np.linspace(0, 200, 400)  # 400 points from 0 to 200
32
+
33
+ # # Compute the corresponding y values
34
+ # y = [WER2INTELI(xi) for xi in x]
35
+
36
+ # # Plot the function
37
+ # plt.plot(x, y)
38
+ # plt.xlabel('x')
39
+ # plt.ylabel('f(x)')
40
+ # plt.title('Custom Function')
41
+ # plt.grid(True)
42
+ # plt.show()
43
+
44
+ # # Generate the x ranges
45
+ # x1 = np.linspace(1, 1.5, 100)
46
+ # x2 = np.linspace(1.5, 5, 100)
47
+
48
+ # # Compute the corresponding y values
49
+ # y1 = linear_function(x1)
50
+ # y2 = quadratic_function(x2)
51
+
52
+ # # Plot the linear segment
53
+ # plt.plot(x1, y1, label='Linear Function (1 <= x <= 1.5)')
54
+
55
+ # # Plot the quadratic segment
56
+ # plt.plot(x2, y2, label='Quadratic Function (1.5 <= x <= 5)')
57
+
58
+ # # Add axis labels and a title
59
+ # plt.xlabel('Natural Mean Opinion Score')
60
+ # plt.ylabel('AVA Mean Opinion Score')
61
+ # plt.title('nat2avaMOS')
62
+
63
+ # # Add a legend
64
+ # plt.legend()
65
+
66
+ # # Show the grid
67
+ # plt.grid(True)
68
+
69
+ # # Save the figure
70
+ # plt.savefig("./local/nat2avaMOS.png")
71
+ # plt.savefig("./local/WER2INT.png")
local/nat2avaMOS.png ADDED