Spaces:

abhishekrajpurohit
/

generate_local_lan

Runtime error

App Files Files Community

abhishekrajpurohit committed 8 days ago

Commit

f34acd1

verified ·

1 Parent(s): 195bb33

Upload 9 files

Browse files

Files changed (9) hide show

config/__pycache__/language_mapping.cpython-312.pyc +0 -0
config/language_mapping.py +154 -0
models/__pycache__/tts.cpython-312.pyc +0 -0
models/tts.py +50 -0
utils/.DS_Store +0 -0
utils/__pycache__/audio_utils.cpython-312.pyc +0 -0
utils/__pycache__/input_validation.cpython-312.pyc +0 -0
utils/audio_utils.py +31 -0
utils/input_validation.py +7 -0

config/__pycache__/language_mapping.cpython-312.pyc ADDED Viewed

Binary file (5.62 kB). View file

config/language_mapping.py ADDED Viewed

	@@ -0,0 +1,154 @@

+# Speaker names offered for each language. The keys double as the set of
+# supported languages: validate_input rejects any language that is not a key
+# here, and get_speakers_for_language returns the list stored under a key.
+LANGUAGE_VOICE_MAPPING = {
+    "Assamese": ["Amit", "Sita"],
+    "Bengali": ["Arjun", "Aditi"],
+    "Bodo": ["Bikram", "Maya"],
+    "Chhattisgarhi": ["Bhanu", "Champa"],
+    "Dogri": ["Karan"],
+    "English": ["Thoma", "Mary"],
+    "Gujarati": ["Yash", "Neha"],
+    "Hindi": ["Rohit", "Divya"],
+    "Kannada": ["Suresh", "Anu"],
+    "Malayalam": ["Anjali", "Harish"],
+    "Manipuri": ["Laishram", "Ranjit"],
+    "Marathi": ["Sanjay", "Sunita"],
+    "Nepali": ["Amrita"],
+    "Odia": ["Manas", "Debjani"],
+    "Punjabi": ["Divjot", "Gurpreet"],
+    "Sanskrit": ["Aryan"],
+    "Tamil": ["Jaya", "Kavitha"],
+    "Telugu": ["Prakash", "Lalitha"]
+}
+# Short voice description for each speaker, keyed by speaker name.
+# construct_description inserts the value right after the language and uses
+# the generic phrase 'with clear articulation' for speakers missing here.
+VOICE_CHARACTERISTICS = {
+    "Amit": "slightly deep and resonant",
+    "Sita": "clear and well-paced",
+    "Arjun": "moderate and clear",
+    "Aditi": "high-pitched and expressive",
+    "Bikram": "higher-pitched and energetic",
+    "Maya": "balanced and pleasant",
+    "Bhanu": "warm and measured",
+    "Champa": "clear and gentle",
+    "Karan": "high-pitched and engaging",
+    "Thoma": "clear and well-articulated",
+    "Mary": "pleasant and measured",
+    "Yash": "warm and balanced",
+    "Neha": "clear and dynamic",
+    "Rohit": "moderate and expressive",
+    "Divya": "pleasant and well-paced",
+    "Suresh": "clear and precise",
+    "Anu": "warm and melodious",
+    "Anjali": "high-pitched and pleasant",
+    "Harish": "deep and measured",
+    "Laishram": "balanced and smooth",
+    "Ranjit": "clear and authoritative",
+    "Sanjay": "deep and authoritative",
+    "Sunita": "high-pitched and pleasant",
+    "Amrita": "high-pitched and gentle",
+    "Manas": "moderate and measured",
+    "Debjani": "clear and pleasant",
+    "Divjot": "clear and dynamic",
+    "Gurpreet": "warm and balanced",
+    "Aryan": "resonant and measured",
+    "Jaya": "high-pitched and melodious",
+    "Kavitha": "clear and expressive",
+    "Prakash": "clear and well-paced",
+    "Lalitha": "pleasant and melodious"
+}
+# Emotion phrases, keyed by the values accepted for construct_description's
+# `emotion` argument (default "Neutral"). The lookup there is a plain index,
+# so a key missing from this table raises KeyError.
+EMOTION_DESC = {
+    "Neutral": "maintaining a balanced and natural tone",
+    "Happy": "with a warm and positive energy",
+    "Sad": "with a gentle and somber tone",
+    "Angry": "with intense and strong delivery",
+    "Highly Expressive": "with dynamic and vibrant emotional delivery",
+    "Monotone": "with minimal tonal variation"
+}
+# Speaking-rate phrases, keyed by the values accepted for
+# construct_description's `speed` argument (default "Normal").
+SPEED_DESC = {
+    "Very Slow": "at an extremely measured pace",
+    "Slow": "at a measured, deliberate pace",
+    "Normal": "at a natural, comfortable pace",
+    "Fast": "at a swift, dynamic pace",
+    "Very Fast": "at a rapid, accelerated pace"
+}
+# Pitch phrases, keyed by the values accepted for construct_description's
+# `pitch` argument (default "Medium").
+PITCH_DESC = {
+    "Very Low": "in an extremely deep register",
+    "Low": "in a deeper register",
+    "Medium": "in a natural pitch range",
+    "High": "in a higher register",
+    "Very High": "in an extremely high register"
+}
+# Background-noise phrases, keyed by the values accepted for
+# construct_description's `background_noise` argument (default "Minimal").
+# Note that "None" is the string key, not the Python None object.
+BACKGROUND_NOISE_DESC = {
+    "None": "with absolutely no background noise",
+    "Minimal": "with minimal background noise",
+    "Moderate": "with moderate ambient noise",
+    "Noticeable": "with noticeable background sounds"
+}
+# Room/reverberation phrases, keyed by the values accepted for
+# construct_description's `reverberation` argument (default "Close").
+REVERBERATION_DESC = {
+    "Very Close": "in an extremely intimate setting",
+    "Close": "in a close-sounding environment",
+    "Moderate": "in a moderately spacious environment",
+    "Distant": "in a spacious, reverberant setting",
+    "Very Distant": "in a very large, echoing space"
+}
+# Recording-quality phrases, keyed by the values accepted for
+# construct_description's `quality` argument (default "High").
+QUALITY_DESC = {
+    "Basic": "in basic audio quality",
+    "Good": "in good audio quality",
+    "High": "in high audio quality",
+    "Studio": "in professional studio quality"
+}
+def construct_description(
+    speaker,
+    language,
+    emotion="Neutral",
+    speed="Normal",
+    pitch="Medium",
+    background_noise="Minimal",
+    reverberation="Close",
+    quality="High"
+):
+    """
+    Build the two-sentence voice description passed to the TTS model.
+    The first sentence covers the speaker's delivery (voice, pitch, emotion,
+    speed); the second covers the recording (reverberation, background noise,
+    quality). Each option is translated to a phrase via the module's tables.
+    Args:
+        speaker (str): Speaker name; names missing from VOICE_CHARACTERISTICS
+            fall back to the phrase 'with clear articulation'
+        language (str): Language name, inserted verbatim
+        emotion (str): Key of EMOTION_DESC
+        speed (str): Key of SPEED_DESC
+        pitch (str): Key of PITCH_DESC
+        background_noise (str): Key of BACKGROUND_NOISE_DESC
+        reverberation (str): Key of REVERBERATION_DESC
+        quality (str): Key of QUALITY_DESC
+    Returns:
+        str: The assembled description
+    Raises:
+        KeyError: If any option other than the speaker is not a key of its table
+    """
+    # Resolve every phrase up front; only the speaker lookup has a fallback.
+    voice_phrase = VOICE_CHARACTERISTICS.get(speaker, 'with clear articulation')
+    pitch_phrase = PITCH_DESC[pitch]
+    emotion_phrase = EMOTION_DESC[emotion]
+    speed_phrase = SPEED_DESC[speed]
+    room_phrase = REVERBERATION_DESC[reverberation]
+    noise_phrase = BACKGROUND_NOISE_DESC[background_noise]
+    quality_phrase = QUALITY_DESC[quality]
+    delivery_sentence = (
+        f"{speaker} speaks in {language} {voice_phrase} "
+        f"{pitch_phrase}, {emotion_phrase} {speed_phrase}."
+    )
+    recording_sentence = (
+        f"The recording is {room_phrase}, {noise_phrase}, "
+        f"captured {quality_phrase}."
+    )
+    return f"{delivery_sentence} {recording_sentence}"
+def get_speakers_for_language(language):
+    """
+    Look up the speakers listed for a language in LANGUAGE_VOICE_MAPPING.
+    Args:
+        language (str): Language name used as the lookup key
+    Returns:
+        list: The speaker names stored for that language, or an empty list
+            when the language has no entry
+    """
+    try:
+        return LANGUAGE_VOICE_MAPPING[language]
+    except KeyError:
+        # Unknown language: report "no speakers" instead of failing.
+        return []

models/__pycache__/tts.cpython-312.pyc ADDED Viewed

Binary file (2.86 kB). View file

models/tts.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import torch
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+import soundfile as sf
+class TTSModel:
+    """Wrapper around the ``ai4bharat/indic-parler-tts`` checkpoint.
+    The constructor loads the Parler-TTS model onto CUDA when it is available
+    (CPU otherwise) and creates two tokenizers: one for the text to be spoken
+    and one for the voice description.
+    """
+    def __init__(self):
+        """Load the model and both tokenizers, printing progress to stdout."""
+        if torch.cuda.is_available():
+            self.device = "cuda"
+        else:
+            self.device = "cpu"
+        self.model_name = "ai4bharat/indic-parler-tts"
+        print(f"Loading model on device: {self.device}")
+        checkpoint = self.model_name
+        loaded_model = ParlerTTSForConditionalGeneration.from_pretrained(checkpoint)
+        self.model = loaded_model.to(self.device)
+        # Tokenizer for the prompt, i.e. the text that will be spoken.
+        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+        # The description uses the tokenizer named by the model's text-encoder
+        # config rather than the prompt tokenizer.
+        text_encoder_name = self.model.config.text_encoder._name_or_path
+        self.description_tokenizer = AutoTokenizer.from_pretrained(text_encoder_name)
+        print("Model loaded successfully")
+    def generate_audio(self, text, description):
+        """Synthesize speech for ``text`` in the voice given by ``description``.
+        Args:
+            text (str): The prompt to be spoken
+            description (str): Voice/recording description that conditions
+                the generation
+        Returns:
+            The generated audio as a NumPy array with singleton dimensions
+            squeezed out
+        Raises:
+            Exception: Any error raised during tokenization or generation is
+                printed and then re-raised unchanged
+        """
+        try:
+            description_batch = self.description_tokenizer(
+                description, return_tensors="pt"
+            ).to(self.device)
+            prompt_batch = self.tokenizer(text, return_tensors="pt").to(self.device)
+            # Gradients are not needed for generation.
+            with torch.no_grad():
+                waveform = self.model.generate(
+                    input_ids=description_batch.input_ids,
+                    attention_mask=description_batch.attention_mask,
+                    prompt_input_ids=prompt_batch.input_ids,
+                    prompt_attention_mask=prompt_batch.attention_mask,
+                )
+                # Move to host memory before converting to NumPy.
+                samples = waveform.cpu().numpy().squeeze()
+            return samples
+        except Exception as err:
+            print(f"Error in speech generation: {str(err)}")
+            raise

utils/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

utils/__pycache__/audio_utils.cpython-312.pyc ADDED Viewed

Binary file (1.84 kB). View file

utils/__pycache__/input_validation.cpython-312.pyc ADDED Viewed

Binary file (606 Bytes). View file

utils/audio_utils.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import os
+import soundfile as sf
+import hashlib
+def ensure_dir(directory):
+    """Create ``directory`` (and any missing parents) if it does not exist.
+    Args:
+        directory (str): Path of the directory to create
+    Raises:
+        FileExistsError: If the path exists but is not a directory
+    """
+    # exist_ok=True makes this a single call, so a directory created by
+    # another request between a check and the creation cannot make it fail.
+    os.makedirs(directory, exist_ok=True)
+def get_audio_filename(text, language, speaker, emotion, speed, pitch, background_noise, reverberation, quality):
+    """Return a deterministic cache key (MD5 hex digest) for one synthesis request.
+    The parameters are hashed as the ``repr`` of a tuple instead of one
+    concatenated string, so field boundaries are part of the key: requests
+    such as ("ab", "c", ...) and ("a", "bc", ...) get different filenames.
+    Args:
+        text (str): Text to be spoken
+        language (str): Language name
+        speaker (str): Speaker name
+        emotion (str): Emotion option
+        speed (str): Speed option
+        pitch (str): Pitch option
+        background_noise (str): Background-noise option
+        reverberation (str): Reverberation option
+        quality (str): Quality option
+    Returns:
+        str: 32-character lowercase hex digest, used as the file stem
+    """
+    # NOTE: keys differ from those produced by plain concatenation, so audio
+    # files cached under such older names will not be matched.
+    key_fields = (
+        text, language, speaker, emotion, speed,
+        pitch, background_noise, reverberation, quality,
+    )
+    # MD5 serves only as a compact cache key here, not as a security measure.
+    return hashlib.md5(repr(key_fields).encode("utf-8")).hexdigest()
+def save_audio(audio_array, filename, sampling_rate=22050):
+    """Write ``audio_array`` to ``static/audio/<filename>.wav``.
+    The output directory is created first if it is missing.
+    Args:
+        audio_array: Audio samples, passed straight to ``soundfile.write``
+        filename (str): File stem, without directory or extension
+        sampling_rate (int): Sample rate written to the file
+    Returns:
+        str: Relative path of the written file
+    """
+    # NOTE(review): the 22050 default is only right if it matches the model's
+    # output rate; confirm callers pass the model's sampling rate explicitly.
+    output_dir = "static/audio"
+    ensure_dir(output_dir)
+    wav_path = "{}/{}.wav".format(output_dir, filename)
+    sf.write(wav_path, audio_array, sampling_rate)
+    return wav_path
+def get_cached_audio(text, language, speaker, emotion, speed, pitch, background_noise, reverberation, quality):
+    """Return the path of a previously saved WAV for these parameters, if any.
+    The file name is derived with get_audio_filename from the same nine
+    parameters, and the file is looked up under ``static/audio``.
+    Returns:
+        str or None: ``static/audio/<key>.wav`` when that file exists,
+            otherwise None
+    """
+    cache_key = get_audio_filename(
+        text, language, speaker, emotion, speed,
+        pitch, background_noise, reverberation, quality,
+    )
+    candidate = "static/audio/{}.wav".format(cache_key)
+    return candidate if os.path.exists(candidate) else None

utils/input_validation.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from config.language_mapping import LANGUAGE_VOICE_MAPPING
+def validate_input(text, language):
+    """Validate a synthesis request before it reaches the model.
+    Args:
+        text (str): Text to be spoken; None, empty and whitespace-only values
+            are all rejected
+        language (str): Language name; must be a key of LANGUAGE_VOICE_MAPPING
+    Raises:
+        ValueError: If the text is missing/blank or the language is unsupported
+    """
+    # A missing value (None) is reported as empty text instead of surfacing
+    # as an AttributeError from .strip().
+    if text is None or not text.strip():
+        raise ValueError("Input text cannot be empty.")
+    if language not in LANGUAGE_VOICE_MAPPING:
+        raise ValueError(f"Language {language} is not supported.")