# Spaces: Running on Zero
import spaces | |
import gradio as gr | |
import torch | |
import numpy as np | |
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC | |
import platform | |
import librosa | |
# Processor and CTC model are loaded from the same pretrained checkpoint so
# that feature extraction and the decoding vocabulary agree.
_CHECKPOINT = "facebook/wav2vec2-lv-60-espeak-cv-ft"

processor = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)
# Inference inputs are sent to 'cuda' as well, so the weights must live there.
model.to('cuda')
class PhoneticEnhancer:
    """Rule-based post-processor for space-separated phoneme strings.

    Four optional passes are available: diphthong merging, vowel
    lengthening, vowel-quality substitution and stress marking. Each pass
    is a plain table lookup or a simple structural rule; no acoustic
    information is used.
    """

    # Characters that make a phoneme count as a vowel. Built once instead of
    # on every `_is_vowel` call.
    _VOWEL_CHARS = frozenset('aeiouɑɐəæɛɪʊʌɔ')

    def __init__(self):
        # Vowel length rules
        self.long_vowels = {
            'i': 'iː',
            'u': 'uː',
            'a': 'ɑː',
            'ɑ': 'ɑː',
            'e': 'eː',
            'o': 'oː',
        }
        # Common diphthongs, keyed by the concatenation of two adjacent phonemes
        self.diphthongs = {
            'ei': 'eɪ',
            'ai': 'aɪ',
            'oi': 'ɔɪ',
            'ou': 'əʊ',
            'au': 'aʊ',
        }
        # Vowel quality adjustments
        self.vowel_quality = {
            'ə': 'æ',  # In stressed positions
            'ɐ': 'æ',  # Common substitution
        }
        # Stress pattern rules (not consulted by any method in this class)
        self.stress_patterns = [
            # (pattern, position) - position is index from start
            (['CV', 'CV'], 1),   # For words like "piage"
            (['CVV', 'CV'], 0),  # For words with long first vowel
        ]

    def _is_vowel(self, phoneme: str) -> bool:
        """Return True if any character of *phoneme* is a known vowel symbol."""
        return not self._VOWEL_CHARS.isdisjoint(phoneme)

    def _split_into_syllables(self, phonemes: list) -> list:
        """Group phonemes into syllables, closing a syllable at each vowel.

        Consonants left over after the last vowel are attached to the final
        syllable; if there is no vowel at all they form a single syllable.
        Flattening the result always yields the input sequence unchanged.
        """
        syllables = []
        current_syllable = []
        for phoneme in phonemes:
            current_syllable.append(phoneme)
            if self._is_vowel(phoneme):
                syllables.append(current_syllable)
                current_syllable = []
        if current_syllable:
            if syllables:
                syllables[-1].extend(current_syllable)
            else:
                syllables.append(current_syllable)
        return syllables

    def _merge_diphthongs(self, phonemes: list) -> list:
        """Replace adjacent phoneme pairs found in ``self.diphthongs``."""
        merged = []
        i = 0
        while i < len(phonemes):
            pair = phonemes[i] + phonemes[i + 1] if i + 1 < len(phonemes) else None
            if pair in self.diphthongs:
                merged.append(self.diphthongs[pair])
                i += 2  # both members of the pair are consumed
            else:
                merged.append(phonemes[i])
                i += 1
        return merged

    def _add_stress(self, phonemes: list) -> list:
        """Insert a 'ˈ' token before the stressed syllable of a multi-syllable word.

        The first syllable containing 'æ' is stressed; if none does, the
        first syllable is. Single-syllable input is returned unmarked.
        """
        syllables = self._split_into_syllables(phonemes)
        if len(syllables) > 1:
            stressed = next(
                (syll for syll in syllables if any('æ' in p for p in syll)),
                syllables[0],
            )
            stressed.insert(0, 'ˈ')
        # Flatten syllables back to phonemes
        return [p for syll in syllables for p in syll]

    def enhance_transcription(self, raw_phonemes: str, enhancements: list = None) -> str:
        """Apply the selected enhancement passes to a phoneme string.

        Args:
            raw_phonemes: Whitespace-separated phoneme symbols.
            enhancements: Names of the passes to run, drawn from 'length',
                'quality', 'stress' and 'diphthongs'. ``None`` runs all
                four; unknown names are ignored.

        Returns:
            The enhanced phonemes joined by single spaces.
        """
        if enhancements is None:
            enhancements = ['length', 'quality', 'stress', 'diphthongs']

        # Split into individual phonemes
        phonemes = raw_phonemes.split()

        # Diphthongs must be merged before lengthening: every vowel that can
        # start or end a diphthong is also a key of `long_vowels`, so running
        # the length pass first rewrites 'e i' to 'eː iː' and the pair lookup
        # can never match.
        if 'diphthongs' in enhancements:
            phonemes = self._merge_diphthongs(phonemes)

        if 'length' in enhancements:
            phonemes = [self.long_vowels.get(p, p) for p in phonemes]

        if 'quality' in enhancements:
            phonemes = [self.vowel_quality.get(p, p) for p in phonemes]

        # Stress runs last because it keys on 'æ', which the quality pass
        # may have just introduced.
        if 'stress' in enhancements:
            phonemes = self._add_stress(phonemes)

        return ' '.join(phonemes)
def preprocess_audio(audio):
    """Convert a ``(sample_rate, samples)`` tuple to mono float32 audio at 16 kHz.

    Args:
        audio: A ``(sample_rate, samples)`` tuple. ``samples`` is a 1-D array,
            or a 2-D array that is averaged over axis 1 to produce mono.

    Returns:
        A 1-D float32 array resampled to 16 kHz, or ``None`` when *audio* is
        not a tuple or contains no samples.
    """
    if not isinstance(audio, tuple):
        return None
    sample_rate, audio_data = audio

    audio_data = np.asarray(audio_data)
    if audio_data.size == 0:
        # Nothing was recorded; report it as invalid input instead of
        # failing later inside max()/min().
        return None

    source_dtype = audio_data.dtype
    audio_data = audio_data.astype(np.float32, copy=False)

    if np.issubdtype(source_dtype, np.signedinteger):
        # Integer PCM: scale by the full range of the source type. Deciding
        # from the sample values instead would leave a near-silent clip
        # (all samples in {-1, 0, 1}) at full amplitude, and a fixed 32768
        # would be wrong for anything other than int16.
        audio_data = audio_data / float(-np.iinfo(source_dtype).min)
    elif audio_data.max() > 1.0 or audio_data.min() < -1.0:
        # Other input outside [-1, 1]: keep the legacy int16-range scaling.
        audio_data = audio_data / 32768.0

    if audio_data.ndim > 1:
        # Down-mix by averaging across axis 1.
        audio_data = audio_data.mean(axis=1)

    if sample_rate != 16000:
        audio_data = librosa.resample(
            y=audio_data,
            orig_sr=sample_rate,
            target_sr=16000,
        )
    return audio_data
# The file imports `spaces` and moves the model to CUDA; on ZeroGPU a GPU is
# only attached while a function wrapped in `spaces.GPU` is executing.
@spaces.GPU
def transcribe_to_phonemes(audio, enhancements):
    """Transcribe audio to phonemes with enhancements.

    Args:
        audio: A ``(sample_rate, samples)`` tuple; anything that
            ``preprocess_audio`` rejects produces a prompt to supply valid
            audio.
        enhancements: Comma-separated enhancement names. Surrounding
            whitespace and letter case are ignored and empty entries are
            dropped; an empty or missing value applies no enhancements.

    Returns:
        A three-line report with the raw transcription, the enhanced
        transcription and the enhancements applied, or an error message with
        a traceback if processing fails.
    """
    try:
        audio_data = preprocess_audio(audio)
        if audio_data is None:
            return "Please provide valid audio input"

        # Normalise each name so that "length, Quality" is not silently
        # ignored because of the space or the capital letter.
        selected_enhancements = [
            name.strip().lower()
            for name in (enhancements or "").split(',')
            if name.strip()
        ]

        inputs = processor(
            audio_data,
            sampling_rate=16000,
            return_tensors="pt",
            padding=True,
        ).input_values.to('cuda')

        with torch.no_grad():
            logits = model(inputs).logits

        # Greedy decoding: take the highest-scoring token at each position.
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        enhancer = PhoneticEnhancer()
        enhanced = enhancer.enhance_transcription(
            transcription,
            selected_enhancements,
        )

        applied = ', '.join(selected_enhancements) or 'none'
        return (
            f"Raw IPA: {transcription}\n"
            f"Enhanced IPA: {enhanced}\n"
            f"Applied enhancements: {applied}"
        )
    except Exception as e:
        # UI boundary: show the failure in the output box rather than
        # letting the request crash.
        import traceback
        return f"Error processing audio: {str(e)}\n{traceback.format_exc()}"
# Text shown under the title in the web UI.
_DESCRIPTION = """Convert speech to phonemes with customizable IPA enhancements.
Available enhancements:
- length: Add vowel length markers (ː)
- quality: Adjust vowel quality (e.g., ə → æ)
- stress: Add stress marks (ˈ)
- diphthongs: Combine vowels into diphthongs (e.g., ei → eɪ)
"""

# Audio can be recorded or uploaded and is handed to the callback as numpy data.
_audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy")

# Free-text list of enhancement passes; all four are pre-selected.
_enhancements_input = gr.Textbox(
    label="Enhancements (comma-separated)",
    value="length,quality,stress,diphthongs",
    placeholder="e.g., length,quality,stress,diphthongs",
)

iface = gr.Interface(
    fn=transcribe_to_phonemes,
    inputs=[_audio_input, _enhancements_input],
    outputs="text",
    title="Speech to Phoneme Converter - Enhanced IPA",
    description=_DESCRIPTION,
)

iface.launch()