Commit 31718a6 · verified · davidmeikle committed · 1 Parent(s): 614dc5d

Update app.py

Files changed (1)
  1. app.py +86 -120
app.py CHANGED
@@ -5,16 +5,10 @@ import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import platform
import librosa
-import multiprocessing
-from dataclasses import dataclass
-from typing import Dict, Tuple, List

-@dataclass
-class ModelConfig:
-    name: str
-    processor: Wav2Vec2Processor
-    model: Wav2Vec2ForCTC
-    description: str
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
+model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
+model.to('cuda')

class PhoneticEnhancer:
    def __init__(self):
@@ -54,7 +48,7 @@ class PhoneticEnhancer:
        vowels = set('aeiouɑɐəæɛɪʊʌɔ')
        return any(char in vowels for char in phoneme)

-    def _split_into_syllables(self, phonemes: List[str]) -> List[List[str]]:
+    def _split_into_syllables(self, phonemes: list) -> list:
        syllables = []
        current_syllable = []

@@ -72,7 +66,7 @@

        return syllables

-    def enhance_transcription(self, raw_phonemes: str, enhancements: List[str] = None) -> str:
+    def enhance_transcription(self, raw_phonemes: str, enhancements: list = None) -> str:
        if enhancements is None:
            enhancements = ['length', 'quality', 'stress', 'diphthongs']

@@ -120,117 +114,89 @@ class PhoneticEnhancer:

        return ' '.join(enhanced_phonemes)

-class PhonemeTranscriber:
-    def __init__(self):
-        self.device = self._get_optimal_device()
-        print(f"Using device: {self.device}")
-
-        # Store model name and initialize processor only
-        self.model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
-        self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
-        self.target_sample_rate = 16_000
-        self.enhancer = PhoneticEnhancer()
+def preprocess_audio(audio):
+    """Preprocess audio data for model input."""
+    if isinstance(audio, tuple):
+        sample_rate, audio_data = audio
+    else:
+        return None

-    def _get_optimal_device(self):
-        if torch.cuda.is_available():
-            return "cuda"
-        elif torch.backends.mps.is_available() and platform.system() == 'Darwin':
-            return "mps"
-        return "cpu"
-
-    def preprocess_audio(self, audio):
-        """Preprocess audio data for model input."""
-        if isinstance(audio, tuple):
-            sample_rate, audio_data = audio
-        else:
-            return None
-
-        if audio_data.dtype != np.float32:
-            audio_data = audio_data.astype(np.float32)
-
-        if audio_data.max() > 1.0 or audio_data.min() < -1.0:
-            audio_data = audio_data / 32768.0
-
-        if len(audio_data.shape) > 1:
-            audio_data = audio_data.mean(axis=1)
-
-        if sample_rate != self.target_sample_rate:
-            audio_data = librosa.resample(
-                y=audio_data,
-                orig_sr=sample_rate,
-                target_sr=self.target_sample_rate
-            )
-
-        return audio_data
+    if audio_data.dtype != np.float32:
+        audio_data = audio_data.astype(np.float32)
+
+    if audio_data.max() > 1.0 or audio_data.min() < -1.0:
+        audio_data = audio_data / 32768.0
+
+    if len(audio_data.shape) > 1:
+        audio_data = audio_data.mean(axis=1)
+
+    if sample_rate != 16000:
+        audio_data = librosa.resample(
+            y=audio_data,
+            orig_sr=sample_rate,
+            target_sr=16000
+        )
+
+    return audio_data

-    @spaces.GPU
-    def transcribe_to_phonemes(self, audio, enhancements):
-        """Transcribe audio to phonemes with enhancements."""
-        try:
-            audio_data = self.preprocess_audio(audio)
-            if audio_data is None:
-                return "Please provide valid audio input"
-
-            # Load model inside GPU context
-            model = Wav2Vec2ForCTC.from_pretrained(self.model_name).to(self.device)
-            model.eval()
-
-            selected_enhancements = enhancements.split(',') if enhancements else []
-            inputs = self.processor(
-                audio_data,
-                sampling_rate=self.target_sample_rate,
-                return_tensors="pt",
-                padding=True
-            ).input_values.to(self.device)
-
-            with torch.no_grad():
-                logits = model(inputs).logits
-
-            predicted_ids = torch.argmax(logits, dim=-1)
-            transcription = self.processor.batch_decode(predicted_ids)[0]
-
-            enhanced = self.enhancer.enhance_transcription(
-                transcription,
-                selected_enhancements
-            )
-
-            # Clean up to free GPU memory
-            del model
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-
-            return f"""Raw IPA: {transcription}
+@spaces.GPU
+def transcribe_to_phonemes(audio, enhancements):
+    """Transcribe audio to phonemes with enhancements."""
+    try:
+        audio_data = preprocess_audio(audio)
+        if audio_data is None:
+            return "Please provide valid audio input"
+
+        selected_enhancements = enhancements.split(',') if enhancements else []
+        inputs = processor(
+            audio_data,
+            sampling_rate=16000,
+            return_tensors="pt",
+            padding=True
+        ).input_values.to('cuda')
+
+        with torch.no_grad():
+            logits = model(inputs).logits
+
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = processor.batch_decode(predicted_ids)[0]
+
+        enhancer = PhoneticEnhancer()
+        enhanced = enhancer.enhance_transcription(
+            transcription,
+            selected_enhancements
+        )
+
+        return f"""Raw IPA: {transcription}
Enhanced IPA: {enhanced}
Applied enhancements: {', '.join(selected_enhancements) or 'none'}"""
-
-        except Exception as e:
-            import traceback
-            return f"Error processing audio: {str(e)}\n{traceback.format_exc()}"
+
+    except Exception as e:
+        import traceback
+        return f"Error processing audio: {str(e)}\n{traceback.format_exc()}"

-if __name__ == "__main__":
-    multiprocessing.freeze_support()
-    transcriber = PhonemeTranscriber()
-    iface = gr.Interface(
-        fn=transcriber.transcribe_to_phonemes,
-        inputs=[
-            gr.Audio(sources=["microphone", "upload"], type="numpy"),
-            gr.Textbox(
-                label="Enhancements (comma-separated)",
-                value="length,quality,stress,diphthongs",
-                placeholder="e.g., length,quality,stress,diphthongs"
-            )
-        ],
-        outputs="text",
-        title="Speech to Phoneme Converter - Enhanced IPA",
-        description=f"""Convert speech to phonemes with customizable IPA enhancements.
-        Currently using device: {transcriber.device}
-
-        Available enhancements:
-        - length: Add vowel length markers (ː)
-        - quality: Adjust vowel quality (e.g., ə → æ)
-        - stress: Add stress marks (ˈ)
-        - diphthongs: Combine vowels into diphthongs (e.g., ei → eɪ)
-        """
-    )
-
-    iface.launch()
+iface = gr.Interface(
+    fn=transcribe_to_phonemes,
+    inputs=[
+        gr.Audio(sources=["microphone", "upload"], type="numpy"),
+        gr.Textbox(
+            label="Enhancements (comma-separated)",
+            value="length,quality,stress,diphthongs",
+            placeholder="e.g., length,quality,stress,diphthongs"
+        )
+    ],
+    outputs="text",
+    title="Speech to Phoneme Converter - Enhanced IPA",
+    description="""Convert speech to phonemes with customizable IPA enhancements.
+
+Available enhancements:
+- length: Add vowel length markers (ː)
+- quality: Adjust vowel quality (e.g., ə → æ)
+- stress: Add stress marks (ˈ)
+- diphthongs: Combine vowels into diphthongs (e.g., ei → eɪ)
+
+Example: "piaʒe" → "piːˈæʒeɪ"
+"""
+)
+
+iface.launch()
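
A minimal sketch for smoke-testing the refactored entry points outside the Gradio UI. It assumes a CUDA-capable machine (model.to('cuda') now runs at import time) and that the module-level iface.launch() call is temporarily commented out, since this commit drops the old if __name__ == "__main__" guard; the 440 Hz test tone is a hypothetical stand-in for the (sample_rate, ndarray) tuple that gr.Audio(type="numpy") delivers:

import numpy as np
from app import preprocess_audio, transcribe_to_phonemes

# One second of int16 audio at 44.1 kHz, mimicking a microphone capture.
sr = 44100
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)

# The helper should hand back float32 mono audio resampled to 16 kHz,
# i.e. dtype float32 and shape (16000,) for this input.
processed = preprocess_audio((sr, tone))
print(processed.dtype, processed.shape)

# Full pipeline: raw IPA, enhanced IPA, and the applied-enhancement list.
print(transcribe_to_phonemes((sr, tone), "length,stress"))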