abhishekrajpurohit committed on
Commit
f34acd1
·
verified ·
1 Parent(s): 195bb33

Upload 9 files

Browse files
config/__pycache__/language_mapping.cpython-312.pyc ADDED
Binary file (5.62 kB). View file
 
config/language_mapping.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Supported languages mapped to their recommended speaker voices.
# Keys are language display names; each value lists speaker names that also
# appear as keys in VOICE_CHARACTERISTICS below. Note some languages only
# have a single recommended speaker (e.g. Dogri, Nepali, Sanskrit).
LANGUAGE_VOICE_MAPPING = {
    "Assamese": ["Amit", "Sita"],
    "Bengali": ["Arjun", "Aditi"],
    "Bodo": ["Bikram", "Maya"],
    "Chhattisgarhi": ["Bhanu", "Champa"],
    "Dogri": ["Karan"],
    "English": ["Thoma", "Mary"],
    "Gujarati": ["Yash", "Neha"],
    "Hindi": ["Rohit", "Divya"],
    "Kannada": ["Suresh", "Anu"],
    "Malayalam": ["Anjali", "Harish"],
    "Manipuri": ["Laishram", "Ranjit"],
    "Marathi": ["Sanjay", "Sunita"],
    "Nepali": ["Amrita"],
    "Odia": ["Manas", "Debjani"],
    "Punjabi": ["Divjot", "Gurpreet"],
    "Sanskrit": ["Aryan"],
    "Tamil": ["Jaya", "Kavitha"],
    "Telugu": ["Prakash", "Lalitha"]
}

# Voice characteristics for each speaker.
# These short phrases are interpolated into the natural-language style
# description consumed by the TTS model (see construct_description).
VOICE_CHARACTERISTICS = {
    "Amit": "slightly deep and resonant",
    "Sita": "clear and well-paced",
    "Arjun": "moderate and clear",
    "Aditi": "high-pitched and expressive",
    "Bikram": "higher-pitched and energetic",
    "Maya": "balanced and pleasant",
    "Bhanu": "warm and measured",
    "Champa": "clear and gentle",
    "Karan": "high-pitched and engaging",
    "Thoma": "clear and well-articulated",
    "Mary": "pleasant and measured",
    "Yash": "warm and balanced",
    "Neha": "clear and dynamic",
    "Rohit": "moderate and expressive",
    "Divya": "pleasant and well-paced",
    "Suresh": "clear and precise",
    "Anu": "warm and melodious",
    "Anjali": "high-pitched and pleasant",
    "Harish": "deep and measured",
    "Laishram": "balanced and smooth",
    "Ranjit": "clear and authoritative",
    "Sanjay": "deep and authoritative",
    "Sunita": "high-pitched and pleasant",
    "Amrita": "high-pitched and gentle",
    "Manas": "moderate and measured",
    "Debjani": "clear and pleasant",
    "Divjot": "clear and dynamic",
    "Gurpreet": "warm and balanced",
    "Aryan": "resonant and measured",
    "Jaya": "high-pitched and melodious",
    "Kavitha": "clear and expressive",
    "Prakash": "clear and well-paced",
    "Lalitha": "pleasant and melodious"
}

# Emotion descriptions — phrases describing the emotional tone of delivery.
EMOTION_DESC = {
    "Neutral": "maintaining a balanced and natural tone",
    "Happy": "with a warm and positive energy",
    "Sad": "with a gentle and somber tone",
    "Angry": "with intense and strong delivery",
    "Highly Expressive": "with dynamic and vibrant emotional delivery",
    "Monotone": "with minimal tonal variation"
}

# Speed descriptions — phrases describing the speaking rate.
SPEED_DESC = {
    "Very Slow": "at an extremely measured pace",
    "Slow": "at a measured, deliberate pace",
    "Normal": "at a natural, comfortable pace",
    "Fast": "at a swift, dynamic pace",
    "Very Fast": "at a rapid, accelerated pace"
}

# Pitch modifiers — phrases describing the vocal register.
PITCH_DESC = {
    "Very Low": "in an extremely deep register",
    "Low": "in a deeper register",
    "Medium": "in a natural pitch range",
    "High": "in a higher register",
    "Very High": "in an extremely high register"
}

# Background-noise levels — phrases describing ambient noise in the output.
BACKGROUND_NOISE_DESC = {
    "None": "with absolutely no background noise",
    "Minimal": "with minimal background noise",
    "Moderate": "with moderate ambient noise",
    "Noticeable": "with noticeable background sounds"
}

# Reverberation levels — phrases describing perceived mic distance / room size.
REVERBERATION_DESC = {
    "Very Close": "in an extremely intimate setting",
    "Close": "in a close-sounding environment",
    "Moderate": "in a moderately spacious environment",
    "Distant": "in a spacious, reverberant setting",
    "Very Distant": "in a very large, echoing space"
}

# Audio-quality levels — phrases describing overall recording quality.
QUALITY_DESC = {
    "Basic": "in basic audio quality",
    "Good": "in good audio quality",
    "High": "in high audio quality",
    "Studio": "in professional studio quality"
}
108
+
109
def construct_description(
    speaker,
    language,
    emotion="Neutral",
    speed="Normal",
    pitch="Medium",
    background_noise="Minimal",
    reverberation="Close",
    quality="High"
):
    """
    Constructs a comprehensive description for the TTS model based on all available parameters.

    Args:
        speaker (str): The name of the speaker
        language (str): The language being spoken
        emotion (str): The emotional tone
        speed (str): The speaking speed
        pitch (str): The pitch level
        background_noise (str): Level of background noise
        reverberation (str): Distance/space effect
        quality (str): Audio quality level

    Returns:
        str: A detailed description for the TTS model

    Note:
        Unknown values fall back to the description of that parameter's
        default instead of raising KeyError — consistent with the existing
        fallback for unknown speakers.
    """
    # Resolve every attribute with a default-valued fallback. Previously only
    # the speaker lookup was guarded; an unexpected value for any other
    # attribute raised a bare KeyError.
    voice = VOICE_CHARACTERISTICS.get(speaker, "with clear articulation")
    pitch_phrase = PITCH_DESC.get(pitch, PITCH_DESC["Medium"])
    emotion_phrase = EMOTION_DESC.get(emotion, EMOTION_DESC["Neutral"])
    speed_phrase = SPEED_DESC.get(speed, SPEED_DESC["Normal"])
    reverb_phrase = REVERBERATION_DESC.get(reverberation, REVERBERATION_DESC["Close"])
    noise_phrase = BACKGROUND_NOISE_DESC.get(background_noise, BACKGROUND_NOISE_DESC["Minimal"])
    quality_phrase = QUALITY_DESC.get(quality, QUALITY_DESC["High"])

    description = (
        f"{speaker} speaks in {language} {voice} "
        f"{pitch_phrase}, {emotion_phrase} {speed_phrase}. "
        f"The recording is {reverb_phrase}, {noise_phrase}, "
        f"captured {quality_phrase}."
    )

    return description
143
+
144
def get_speakers_for_language(language):
    """
    Look up the recommended speakers for a language.

    Args:
        language (str): The language to get speakers for

    Returns:
        list: Recommended speaker names; empty list if the language is unknown
    """
    try:
        return LANGUAGE_VOICE_MAPPING[language]
    except KeyError:
        return []
models/__pycache__/tts.cpython-312.pyc ADDED
Binary file (2.86 kB). View file
 
models/tts.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from parler_tts import ParlerTTSForConditionalGeneration
3
+ from transformers import AutoTokenizer
4
+ import soundfile as sf
5
+
6
class TTSModel:
    """Wrapper around the ai4bharat/indic-parler-tts model for speech synthesis."""

    def __init__(self):
        # Prefer GPU when one is available; inference is much slower on CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = "ai4bharat/indic-parler-tts"

        print(f"Loading model on device: {self.device}")

        # Two tokenizers are needed, following the Parler-TTS documentation:
        # the model's own tokenizer for the spoken text, and the text
        # encoder's tokenizer for the style description.
        self.model = ParlerTTSForConditionalGeneration.from_pretrained(
            self.model_name
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.description_tokenizer = AutoTokenizer.from_pretrained(
            self.model.config.text_encoder._name_or_path
        )

        print("Model loaded successfully")

    def generate_audio(self, text, description):
        """Synthesize speech for `text`, styled by `description`.

        Args:
            text (str): The sentence to speak.
            description (str): Natural-language description of the voice/style.

        Returns:
            numpy.ndarray: The generated audio samples (squeezed to 1-D).

        Raises:
            Exception: Re-raises any tokenization/generation error after
                printing it.
        """
        try:
            # Encode the spoken text and the style description separately.
            prompt_enc = self.tokenizer(
                text,
                return_tensors="pt"
            ).to(self.device)

            desc_enc = self.description_tokenizer(
                description,
                return_tensors="pt"
            ).to(self.device)

            # Inference only — no autograd bookkeeping needed.
            with torch.no_grad():
                waveform = self.model.generate(
                    input_ids=desc_enc.input_ids,
                    attention_mask=desc_enc.attention_mask,
                    prompt_input_ids=prompt_enc.input_ids,
                    prompt_attention_mask=prompt_enc.attention_mask
                )

            # Move to host memory and drop singleton dimensions.
            return waveform.cpu().numpy().squeeze()

        except Exception as e:
            print(f"Error in speech generation: {str(e)}")
            raise
utils/.DS_Store ADDED
Binary file (6.15 kB). View file
 
utils/__pycache__/audio_utils.cpython-312.pyc ADDED
Binary file (1.84 kB). View file
 
utils/__pycache__/input_validation.cpython-312.pyc ADDED
Binary file (606 Bytes). View file
 
utils/audio_utils.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import soundfile as sf
3
+ import hashlib
4
+
5
def ensure_dir(directory):
    """Ensure that a directory (and any missing parents) exists.

    Args:
        directory (str): Path of the directory to create.

    Uses exist_ok=True instead of a check-then-create sequence, which was
    racy: another process could create the directory between the exists()
    check and makedirs(), raising FileExistsError.
    """
    os.makedirs(directory, exist_ok=True)
9
+
10
def get_audio_filename(text, language, speaker, emotion, speed, pitch, background_noise, reverberation, quality):
    """Generate a unique, deterministic filename based on input parameters.

    Returns:
        str: Hex MD5 digest of the parameters (no file extension).

    Note:
        Parameters are joined with a '|' delimiter before hashing; plain
        concatenation was ambiguous ("ab"+"c" hashed the same as "a"+"bc"),
        so distinct requests could collide on one cache file. MD5 here is a
        cache key only, not a security measure. Changing the key format
        orphans previously cached files once.
    """
    params = "|".join(
        map(str, (text, language, speaker, emotion, speed, pitch,
                  background_noise, reverberation, quality))
    )
    return hashlib.md5(params.encode()).hexdigest()
17
+
18
def save_audio(audio_array, filename, sampling_rate=22050):
    """Save an audio array as a WAV file under static/audio.

    Args:
        audio_array: Audio samples to write (as accepted by soundfile.write).
        filename (str): Base filename (no extension), e.g. from
            get_audio_filename().
        sampling_rate (int): Sample rate in Hz. Defaults to 22050.

    Returns:
        str: The path of the written file.
    """
    ensure_dir("static/audio")
    # Use the caller-supplied filename. Previously the path contained a
    # hard-coded placeholder, so every call overwrote the same file and the
    # `filename` parameter was silently ignored.
    filepath = f"static/audio/{filename}.wav"
    sf.write(filepath, audio_array, sampling_rate)
    return filepath
24
+
25
def get_cached_audio(text, language, speaker, emotion, speed, pitch, background_noise, reverberation, quality):
    """Return the cached audio filepath for these parameters, or None.

    Args:
        (all): Same parameters that were used to generate/save the audio.

    Returns:
        str | None: Path to the cached WAV file if it exists, else None.
    """
    filename = get_audio_filename(text, language, speaker, emotion, speed, pitch, background_noise, reverberation, quality)
    # Check the per-request file. Previously a hard-coded placeholder path
    # was checked, so the computed `filename` was ignored and the cache
    # could never distinguish between requests.
    filepath = f"static/audio/{filename}.wav"
    if os.path.exists(filepath):
        return filepath
    return None
utils/input_validation.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from config.language_mapping import LANGUAGE_VOICE_MAPPING
2
+
3
def validate_input(text, language):
    """Validate TTS request inputs.

    Args:
        text (str): Text to synthesize; must be non-empty after stripping.
        language (str): Language name; must be in LANGUAGE_VOICE_MAPPING.

    Raises:
        ValueError: If the text is empty/blank/None or the language is
            unsupported.
    """
    # `not text` also guards against None, which previously escaped as an
    # AttributeError from .strip() instead of a clean validation error.
    if not text or not text.strip():
        raise ValueError("Input text cannot be empty.")
    if language not in LANGUAGE_VOICE_MAPPING:
        raise ValueError(f"Language {language} is not supported.")