Spaces:

davidmeikle
/

phoneme-recorder

Running on Zero

App Files Community

davidmeikle committed on Dec 11, 2024

Commit

31718a6

verified ·

1 Parent(s): 614dc5d

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -120

app.py CHANGED Viewed

@@ -5,16 +5,10 @@ import numpy as np
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 import platform
 import librosa
-import multiprocessing
-from dataclasses import dataclass
-from typing import Dict, Tuple, List
-@dataclass
-class ModelConfig:
-    name: str
-    processor: Wav2Vec2Processor
-    model: Wav2Vec2ForCTC
-    description: str
 class PhoneticEnhancer:
     def __init__(self):
@@ -54,7 +48,7 @@ class PhoneticEnhancer:
         vowels = set('aeiouɑɐəæɛɪʊʌɔ')
         return any(char in vowels for char in phoneme)
-    def _split_into_syllables(self, phonemes: List[str]) -> List[List[str]]:
         syllables = []
         current_syllable = []
@@ -72,7 +66,7 @@ class PhoneticEnhancer:
         return syllables
-    def enhance_transcription(self, raw_phonemes: str, enhancements: List[str] = None) -> str:
         if enhancements is None:
             enhancements = ['length', 'quality', 'stress', 'diphthongs']
@@ -120,117 +114,89 @@ class PhoneticEnhancer:
         return ' '.join(enhanced_phonemes)
-class PhonemeTranscriber:
-    def __init__(self):
-        self.device = self._get_optimal_device()
-        print(f"Using device: {self.device}")
-        # Store model name and initialize processor only
-        self.model_name = "facebook/wav2vec2-lv-60-espeak-cv-ft"
-        self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
-        self.target_sample_rate = 16_000
-        self.enhancer = PhoneticEnhancer()
-    def _get_optimal_device(self):
-        if torch.cuda.is_available():
-            return "cuda"
-        elif torch.backends.mps.is_available() and platform.system() == 'Darwin':
-            return "mps"
-        return "cpu"
-    def preprocess_audio(self, audio):
-        """Preprocess audio data for model input."""
-        if isinstance(audio, tuple):
-            sample_rate, audio_data = audio
-        else:
-            return None
-        if audio_data.dtype != np.float32:
-            audio_data = audio_data.astype(np.float32)
-        if audio_data.max() > 1.0 or audio_data.min() < -1.0:
-            audio_data = audio_data / 32768.0
-        if len(audio_data.shape) > 1:
-            audio_data = audio_data.mean(axis=1)
-        if sample_rate != self.target_sample_rate:
-            audio_data = librosa.resample(
-                y=audio_data,
-                orig_sr=sample_rate,
-                target_sr=self.target_sample_rate
-            )
-        return audio_data
-    @spaces.GPU
-    def transcribe_to_phonemes(self, audio, enhancements):
-        """Transcribe audio to phonemes with enhancements."""
-        try:
-            audio_data = self.preprocess_audio(audio)
-            if audio_data is None:
-                return "Please provide valid audio input"
-            # Load model inside GPU context
-            model = Wav2Vec2ForCTC.from_pretrained(self.model_name).to(self.device)
-            model.eval()
-            selected_enhancements = enhancements.split(',') if enhancements else []
-            inputs = self.processor(
-                audio_data,
-                sampling_rate=self.target_sample_rate,
-                return_tensors="pt",
-                padding=True
-            ).input_values.to(self.device)
-            with torch.no_grad():
-                logits = model(inputs).logits
-            predicted_ids = torch.argmax(logits, dim=-1)
-            transcription = self.processor.batch_decode(predicted_ids)[0]
-            enhanced = self.enhancer.enhance_transcription(
-                transcription,
-                selected_enhancements
-            )
-            # Clean up to free GPU memory
-            del model
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-            return f"""Raw IPA: {transcription}
 Enhanced IPA: {enhanced}
 Applied enhancements: {', '.join(selected_enhancements) or 'none'}"""
-        except Exception as e:
-            import traceback
-            return f"Error processing audio: {str(e)}\n{traceback.format_exc()}"
-if __name__ == "__main__":
-    multiprocessing.freeze_support()
-    transcriber = PhonemeTranscriber()
-    iface = gr.Interface(
-        fn=transcriber.transcribe_to_phonemes,
-        inputs=[
-            gr.Audio(sources=["microphone", "upload"], type="numpy"),
-            gr.Textbox(
-                label="Enhancements (comma-separated)",
-                value="length,quality,stress,diphthongs",
-                placeholder="e.g., length,quality,stress,diphthongs"
-            )
-        ],
-        outputs="text",
-        title="Speech to Phoneme Converter - Enhanced IPA",
-        description=f"""Convert speech to phonemes with customizable IPA enhancements.
-                       Currently using device: {transcriber.device}
-                       Available enhancements:
-                       - length: Add vowel length markers (ː)
-                       - quality: Adjust vowel quality (e.g., ə → æ)
-                       - stress: Add stress marks (ˈ)
-                       - diphthongs: Combine vowels into diphthongs (e.g., ei → eɪ)
-                       """
-    )
-    iface.launch()

 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 import platform
 import librosa
+processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
+model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
+model.to('cuda')
 class PhoneticEnhancer:
     def __init__(self):
         vowels = set('aeiouɑɐəæɛɪʊʌɔ')
         return any(char in vowels for char in phoneme)
+    def _split_into_syllables(self, phonemes: list) -> list:
         syllables = []
         current_syllable = []
         return syllables
+    def enhance_transcription(self, raw_phonemes: str, enhancements: list = None) -> str:
         if enhancements is None:
             enhancements = ['length', 'quality', 'stress', 'diphthongs']
         return ' '.join(enhanced_phonemes)
+def preprocess_audio(audio):
+    """Preprocess audio data for model input."""
+    if isinstance(audio, tuple):
+        sample_rate, audio_data = audio
+    else:
+        return None
+    if audio_data.dtype != np.float32:
+        audio_data = audio_data.astype(np.float32)
+    if audio_data.max() > 1.0 or audio_data.min() < -1.0:
+        audio_data = audio_data / 32768.0
+    if len(audio_data.shape) > 1:
+        audio_data = audio_data.mean(axis=1)
+    if sample_rate != 16000:
+        audio_data = librosa.resample(
+            y=audio_data,
+            orig_sr=sample_rate,
+            target_sr=16000
+        )
+    return audio_data
+@spaces.GPU
+def transcribe_to_phonemes(audio, enhancements):
+    """Transcribe audio to phonemes with enhancements."""
+    try:
+        audio_data = preprocess_audio(audio)
+        if audio_data is None:
+            return "Please provide valid audio input"
+        selected_enhancements = enhancements.split(',') if enhancements else []
+        inputs = processor(
+            audio_data,
+            sampling_rate=16000,
+            return_tensors="pt",
+            padding=True
+        ).input_values.to('cuda')
+        with torch.no_grad():
+            logits = model(inputs).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = processor.batch_decode(predicted_ids)[0]
+        enhancer = PhoneticEnhancer()
+        enhanced = enhancer.enhance_transcription(
+            transcription,
+            selected_enhancements
+        )
+        return f"""Raw IPA: {transcription}
 Enhanced IPA: {enhanced}
 Applied enhancements: {', '.join(selected_enhancements) or 'none'}"""
+    except Exception as e:
+        import traceback
+        return f"Error processing audio: {str(e)}\n{traceback.format_exc()}"
+iface = gr.Interface(
+    fn=transcribe_to_phonemes,
+    inputs=[
+        gr.Audio(sources=["microphone", "upload"], type="numpy"),
+        gr.Textbox(
+            label="Enhancements (comma-separated)",
+            value="length,quality,stress,diphthongs",
+            placeholder="e.g., length,quality,stress,diphthongs"
+        )
+    ],
+    outputs="text",
+    title="Speech to Phoneme Converter - Enhanced IPA",
+    description="""Convert speech to phonemes with customizable IPA enhancements.
+                   Available enhancements:
+                   - length: Add vowel length markers (ː)
+                   - quality: Adjust vowel quality (e.g., ə → æ)
+                   - stress: Add stress marks (ˈ)
+                   - diphthongs: Combine vowels into diphthongs (e.g., ei → eɪ)
+                   Example: "piaʒe" → "piːˈæʒeɪ"
+                   """
+)
+iface.launch()