Spaces:
Running
on
Zero
Running
on
Zero
davidmeikle
committed on
Update app.py
Browse files
app.py
CHANGED
@@ -5,16 +5,10 @@ import numpy as np
|
|
5 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
6 |
import platform
|
7 |
import librosa
|
8 |
-
import multiprocessing
|
9 |
-
from dataclasses import dataclass
|
10 |
-
from typing import Dict, Tuple, List
|
11 |
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
processor: Wav2Vec2Processor
|
16 |
-
model: Wav2Vec2ForCTC
|
17 |
-
description: str
|
18 |
|
19 |
class PhoneticEnhancer:
|
20 |
def __init__(self):
|
@@ -54,7 +48,7 @@ class PhoneticEnhancer:
|
|
54 |
vowels = set('aeiouɑɐəæɛɪʊʌɔ')
|
55 |
return any(char in vowels for char in phoneme)
|
56 |
|
57 |
-
def _split_into_syllables(self, phonemes:
|
58 |
syllables = []
|
59 |
current_syllable = []
|
60 |
|
@@ -72,7 +66,7 @@ class PhoneticEnhancer:
|
|
72 |
|
73 |
return syllables
|
74 |
|
75 |
-
def enhance_transcription(self, raw_phonemes: str, enhancements:
|
76 |
if enhancements is None:
|
77 |
enhancements = ['length', 'quality', 'stress', 'diphthongs']
|
78 |
|
@@ -120,117 +114,89 @@ class PhoneticEnhancer:
|
|
120 |
|
121 |
return ' '.join(enhanced_phonemes)
|
122 |
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
self.model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
|
130 |
-
self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
|
131 |
-
self.target_sample_rate = 16_000
|
132 |
-
self.enhancer = PhoneticEnhancer()
|
133 |
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
if audio_data.max() > 1.0 or audio_data.min() < -1.0:
|
152 |
-
audio_data = audio_data / 32768.0
|
153 |
-
|
154 |
-
if len(audio_data.shape) > 1:
|
155 |
-
audio_data = audio_data.mean(axis=1)
|
156 |
-
|
157 |
-
if sample_rate != self.target_sample_rate:
|
158 |
-
audio_data = librosa.resample(
|
159 |
-
y=audio_data,
|
160 |
-
orig_sr=sample_rate,
|
161 |
-
target_sr=self.target_sample_rate
|
162 |
-
)
|
163 |
-
|
164 |
-
return audio_data
|
165 |
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
transcription
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
)
|
196 |
-
|
197 |
-
# Clean up to free GPU memory
|
198 |
-
del model
|
199 |
-
if torch.cuda.is_available():
|
200 |
-
torch.cuda.empty_cache()
|
201 |
-
|
202 |
-
return f"""Raw IPA: {transcription}
|
203 |
Enhanced IPA: {enhanced}
|
204 |
Applied enhancements: {', '.join(selected_enhancements) or 'none'}"""
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
iface.launch()
|
|
|
5 |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
|
6 |
import platform
|
7 |
import librosa
|
|
|
|
|
|
|
8 |
|
9 |
+
# Single checkpoint id shared by the processor and the CTC model, so the two
# cannot drift apart.
_CHECKPOINT = "facebook/wav2vec2-lv-60-espeak-cv-ft"

# Loaded once at import time and reused by every transcription request.
processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
model.to('cuda')
|
|
|
|
|
|
|
12 |
|
13 |
class PhoneticEnhancer:
|
14 |
def __init__(self):
|
|
|
48 |
vowels = set('aeiouɑɐəæɛɪʊʌɔ')
|
49 |
return any(char in vowels for char in phoneme)
|
50 |
|
51 |
+
def _split_into_syllables(self, phonemes: list) -> list:
|
52 |
syllables = []
|
53 |
current_syllable = []
|
54 |
|
|
|
66 |
|
67 |
return syllables
|
68 |
|
69 |
+
def enhance_transcription(self, raw_phonemes: str, enhancements: list = None) -> str:
|
70 |
if enhancements is None:
|
71 |
enhancements = ['length', 'quality', 'stress', 'diphthongs']
|
72 |
|
|
|
114 |
|
115 |
return ' '.join(enhanced_phonemes)
|
116 |
|
117 |
+
def preprocess_audio(audio, target_sample_rate=16000):
    """Convert a ``(sample_rate, samples)`` pair into mono float32 audio.

    Args:
        audio: Tuple of ``(sample_rate, samples)``. ``samples`` is array-like,
            either 1-D or multi-dimensional; multi-dimensional input is
            averaged over ``axis=1`` (presumably the channel axis of the
            Gradio numpy audio format -- confirm against the caller).
            Any non-tuple value is rejected.
        target_sample_rate: Sample rate, in Hz, of the returned signal.
            Defaults to 16000.

    Returns:
        A ``float32`` array resampled to ``target_sample_rate``, or ``None``
        when ``audio`` is not a tuple or holds no samples.
    """
    if not isinstance(audio, tuple):
        return None
    sample_rate, audio_data = audio

    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        # No samples: report as invalid input instead of letting max()/min()
        # raise on an empty array.
        return None

    if np.issubdtype(audio_data.dtype, np.signedinteger):
        # Integer PCM: scale by the dtype's own full-scale value. For int16
        # this is the usual 32768; int32 input is no longer under-scaled, and
        # very quiet int16 clips are no longer mistaken for float audio.
        full_scale = np.float32(float(np.iinfo(audio_data.dtype).max) + 1.0)
        audio_data = audio_data.astype(np.float32) / full_scale
    else:
        if audio_data.dtype != np.float32:
            audio_data = audio_data.astype(np.float32)
        # Non-integer input outside [-1, 1] is treated as int16-range values
        # stored in a float container.
        if audio_data.max() > 1.0 or audio_data.min() < -1.0:
            audio_data = audio_data / 32768.0

    if audio_data.ndim > 1:
        # Down-mix by averaging over axis 1.
        audio_data = audio_data.mean(axis=1)

    if sample_rate != target_sample_rate:
        audio_data = librosa.resample(
            y=audio_data,
            orig_sr=sample_rate,
            target_sr=target_sample_rate,
        )

    return audio_data
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
141 |
|
142 |
+
@spaces.GPU
def transcribe_to_phonemes(audio, enhancements):
    """Transcribe audio to phonemes and apply the requested enhancements.

    Args:
        audio: Audio input handed straight to ``preprocess_audio``.
        enhancements: Comma-separated enhancement names. Whitespace around
            each name and empty entries are ignored; an empty or ``None``
            value selects no enhancements.

    Returns:
        A three-line report containing the raw transcription, the enhanced
        transcription and the applied enhancements. If preprocessing rejects
        the input, a short prompt is returned instead; if anything raises,
        the error message and traceback are returned as text.
    """
    try:
        audio_data = preprocess_audio(audio)
        if audio_data is None:
            return "Please provide valid audio input"

        # Normalize the user-typed list so "length, quality," does not pass
        # " quality" or "" on as enhancement names.
        selected_enhancements = [
            name.strip()
            for name in (enhancements or "").split(',')
            if name.strip()
        ]

        inputs = processor(
            audio_data,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True
        ).input_values.to('cuda')

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(inputs).logits

        # Highest-scoring token id at each position, decoded to text.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        enhancer = PhoneticEnhancer()
        enhanced = enhancer.enhance_transcription(
            transcription,
            selected_enhancements
        )

        return f"""Raw IPA: {transcription}
Enhanced IPA: {enhanced}
Applied enhancements: {', '.join(selected_enhancements) or 'none'}"""

    except Exception as e:
        # Top-level UI boundary: surface the failure as text in the output box
        # rather than letting the request crash.
        import traceback
        return f"Error processing audio: {str(e)}\n{traceback.format_exc()}"
|
177 |
|
178 |
+
# Input widgets, in the positional order expected by transcribe_to_phonemes:
# the audio clip first, then the comma-separated enhancement list.
_audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy")
_enhancements_input = gr.Textbox(
    label="Enhancements (comma-separated)",
    value="length,quality,stress,diphthongs",
    placeholder="e.g., length,quality,stress,diphthongs"
)

# Text shown under the title of the web UI.
_description = """Convert speech to phonemes with customizable IPA enhancements.

Available enhancements:
- length: Add vowel length markers (ː)
- quality: Adjust vowel quality (e.g., ə → æ)
- stress: Add stress marks (ˈ)
- diphthongs: Combine vowels into diphthongs (e.g., ei → eɪ)

Example: "piaʒe" → "piːˈæʒeɪ"
"""

iface = gr.Interface(
    fn=transcribe_to_phonemes,
    inputs=[_audio_input, _enhancements_input],
    outputs="text",
    title="Speech to Phoneme Converter - Enhanced IPA",
    description=_description
)

iface.launch()
|
|
|
|