Shashwat2528 committed on
Commit
785f4dd
·
1 Parent(s): 473ac33

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +214 -0
app.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import sounddevice as sd
2
+ # import soundfile as sf
3
+ # import speech_recognition as sr
4
+ # from gtts import gTTS
5
+ # import pygame
6
+ # import time
7
+ # import gradio as gr
8
+
9
+ # from transformers import AutoTokenizer, AutoModelForQuestionAnswering
10
+
11
+ # model = AutoModelForQuestionAnswering.from_pretrained('AVISHKAARAM/avishkaarak-ekta-hindi')
12
+ # tokenizer = AutoTokenizer.from_pretrained('AVISHKAARAM/avishkaarak-ekta-hindi')
13
+
14
+ # class AvishkaaramEkta:
15
+ # def __init__(self, model):
16
+ # self.model = model
17
+ # self.tokenizer = tokenizer
18
+
19
+ # def text_to_speech(self, text, output_file):
20
+ # # Create a gTTS object with the text and desired language
21
+ # tts = gTTS(text=text, lang='en')
22
+
23
+ # # Save the audio to a file
24
+ # tts.save(output_file)
25
+
26
+ # def play_mp3(self, file_path):
27
+ # pygame.mixer.init()
28
+ # pygame.mixer.music.load(file_path)
29
+ # pygame.mixer.music.play()
30
+ # while pygame.mixer.music.get_busy():
31
+ # continue
32
+
33
+ # def ask_question(self, audio_file):
34
+ # print("Recording audio...")
35
+ # audio = sd.rec(int(44100 * 6), samplerate=44100, channels=1)
36
+ # sd.wait()
37
+
38
+ # # Save the audio to a file
39
+ # sf.write(audio_file, audio, 44100)
40
+
41
+ # print(f"Audio saved to {audio_file}")
42
+ # r = sr.Recognizer()
43
+
44
+ # with sr.AudioFile(audio_file) as source:
45
+ # audio_data = r.record(source)
46
+
47
+ # text = ""
48
+
49
+ # try:
50
+ # text = r.recognize_google(audio_data)
51
+ # print("Transcription:", text)
52
+ # except sr.UnknownValueError:
53
+ # print("Speech recognition could not understand audio")
54
+ # except sr.RequestError as e:
55
+ # print("Could not request results from Google Speech Recognition service; {0}".format(e))
56
+
57
+ # return text
58
+
59
+ # def answer_question(self, passage, question):
60
+ # inputs = self.tokenizer(passage, question, return_tensors="pt")
61
+ # outputs = self.model(**inputs)
62
+ # start_logits = outputs.start_logits
63
+ # end_logits = outputs.end_logits
64
+ # start_index = start_logits.argmax(dim=1).item()
65
+ # end_index = end_logits.argmax(dim=1).item()
66
+ # tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
67
+ # answer = self.tokenizer.convert_tokens_to_string(tokens[start_index:end_index+1])
68
+ # return answer
69
+
70
+ # def question_answer(self, passage, question):
71
+ # passage_audio_file = "passage.mp3"
72
+ # question_audio_file = "question.wav"
73
+ # answer_audio_file = "answer.mp3"
74
+
75
+ # self.text_to_speech(passage, passage_audio_file)
76
+ # self.play_mp3(passage_audio_file)
77
+
78
+ # question_text = self.ask_question(question_audio_file)
79
+ # answer = self.answer_question(passage, question_text)
80
+
81
+ # self.text_to_speech("The answer to the question is: " + answer, answer_audio_file)
82
+ # self.play_mp3(answer_audio_file)
83
+
84
+ # time.sleep(5) # Wait for 5 seconds before ending
85
+
86
+ # return answer
87
+
88
+ # # Create an instance of the AvishkaaramEkta class
89
+ # avishkaaram_ekta = AvishkaaramEkta(model)
90
+
91
+ # # Define the Gradio interface
92
+ # iface = gr.Interface(
93
+ # fn=avishkaaram_ekta.question_answer,
94
+ # inputs=["text", "text"],
95
+ # outputs="text",
96
+ # title="Audio Question Answering",
97
+ # description="Ask a question about a given passage using audio input",
98
+ # examples=[
99
+ # ["In 1960, Dr. Jane Goodall arrived in Gombe, Tanzania to study chimpanzees.", "What did Dr. Jane Goodall study?"],
100
+ # ["The Taj Mahal is located in Agra, India.", "Where is the Taj Mahal situated?"],
101
+ # ],
102
+ # interpretation="default",
103
+ # )
104
+
105
+ # # Launch the Gradio interface
106
+ # iface.launch()
107
+
108
+
109
+ import torch
110
+ import torchaudio
111
+ import soundfile as sf
112
+ import speech_recognition as sr
113
+ from gtts import gTTS
114
+ import pygame
115
+ import time
116
+ import gradio as gr
117
+
118
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering
119
+
120
# The model weights and the tokenizer are loaded from the same checkpoint id,
# so the id is spelled once and reused for both calls.
_CHECKPOINT = 'AVISHKAARAM/avishkaarak-ekta-hindi'

model = AutoModelForQuestionAnswering.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
122
+
123
class AvishkaaramEkta:
    """Spoken question answering over a text passage.

    The workflow in ``question_answer`` is: read the passage aloud, record and
    transcribe a spoken question, extract an answer span from the passage with
    the QA model, and read the answer aloud.
    """

    def __init__(self, model, tokenizer=None):
        """Store the QA model and its tokenizer.

        Args:
            model: Question-answering model. It is called with the tokenizer
                output as keyword arguments and must return an object with
                ``start_logits`` and ``end_logits``.
            tokenizer: Tokenizer to pair with ``model``. When omitted, the
                module-level ``tokenizer`` is used (the original behavior).
        """
        if tokenizer is None:
            # Fall back to the module-level tokenizer; looked up via globals()
            # because the parameter shadows the global name.
            tokenizer = globals()["tokenizer"]
        self.model = model
        self.tokenizer = tokenizer

    def text_to_speech(self, text, output_file):
        """Synthesize ``text`` with gTTS (``lang='en'``) and save it to ``output_file``."""
        tts = gTTS(text=text, lang='en')
        tts.save(output_file)

    def play_mp3(self, file_path, poll_interval=0.1):
        """Play an audio file through pygame and block until playback ends.

        Args:
            file_path: Path of the audio file to play.
            poll_interval: Seconds to sleep between playback-status checks.
        """
        pygame.mixer.init()
        pygame.mixer.music.load(file_path)
        pygame.mixer.music.play()
        # Sleep between checks instead of spinning in a tight loop, which
        # would keep a CPU core fully busy for the whole playback.
        while pygame.mixer.music.get_busy():
            time.sleep(poll_interval)

    def ask_question(self, audio_file, duration=6, sample_rate=44100):
        """Record a spoken question, save it as WAV, and return its transcription.

        Recording uses ``speech_recognition``'s microphone source. The previous
        implementation called ``torchaudio.rec``, which torchaudio does not
        provide, so it failed before any audio was captured.

        Args:
            audio_file: Path where the recorded WAV data is written.
            duration: Recording length in seconds.
            sample_rate: Microphone sample rate in Hz.

        Returns:
            The transcribed text, or ``""`` when the audio could not be
            understood or the recognition service could not be reached.
        """
        print("Recording audio...")
        r = sr.Recognizer()

        # NOTE(review): sr.Microphone needs PyAudio and a capture device on the
        # machine running this code -- confirm for the deployment target.
        with sr.Microphone(sample_rate=sample_rate) as source:
            audio_data = r.record(source, duration=duration)

        # Save the audio to a file
        with open(audio_file, "wb") as wav_file:
            wav_file.write(audio_data.get_wav_data())

        print(f"Audio saved to {audio_file}")

        text = ""
        try:
            text = r.recognize_google(audio_data)
            print("Transcription:", text)
        except sr.UnknownValueError:
            print("Speech recognition could not understand audio")
        except sr.RequestError as e:
            print(f"Could not request results from Google Speech Recognition service; {e}")

        return text

    def answer_question(self, passage, question):
        """Extract the answer to ``question`` from ``passage``.

        The answer is the token span from the arg-max of the start logits to
        the arg-max of the end logits (inclusive), converted back to a string.

        Returns:
            The answer text, or ``""`` when the predicted end position comes
            before the predicted start position.
        """
        # NOTE(review): the passage is passed as the first sequence and the
        # question as the second; many QA checkpoints expect the question
        # first -- confirm against how this checkpoint was trained.
        inputs = self.tokenizer(passage, question, return_tensors="pt")

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = self.model(**inputs)

        start_index = outputs.start_logits.argmax(dim=1).item()
        end_index = outputs.end_logits.argmax(dim=1).item()
        if end_index < start_index:
            # Inverted span: there is nothing to extract.
            return ""

        tokens = self.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
        return self.tokenizer.convert_tokens_to_string(tokens[start_index:end_index + 1])

    def question_answer(self, passage, question):
        """Run the full spoken QA round trip and return the answer text.

        Args:
            passage: Text that is read aloud and searched for the answer.
            question: Typed question, used only when the spoken question
                yields an empty transcription.

        Returns:
            The extracted answer string.
        """
        passage_audio_file = "passage.mp3"
        question_audio_file = "question.wav"
        answer_audio_file = "answer.mp3"

        self.text_to_speech(passage, passage_audio_file)
        self.play_mp3(passage_audio_file)

        # Prefer the spoken question; fall back to the typed one rather than
        # querying the model with an empty string.
        question_text = self.ask_question(question_audio_file) or question
        answer = self.answer_question(passage, question_text)

        self.text_to_speech("The answer to the question is: " + answer, answer_audio_file)
        self.play_mp3(answer_audio_file)

        time.sleep(5)  # Wait for 5 seconds before ending

        return answer
195
+
196
# One shared assistant instance backs every request served by the UI.
avishkaaram_ekta = AvishkaaramEkta(model)

# Options for the Gradio interface, gathered in one place before construction.
# NOTE(review): `interpretation` may not be accepted by newer Gradio releases --
# confirm against the installed version.
_interface_options = dict(
    fn=avishkaaram_ekta.question_answer,
    inputs=["text", "text"],
    outputs="text",
    title="Audio Question Answering",
    description="Ask a question about a given passage using audio input",
    examples=[
        ["In 1960, Dr. Jane Goodall arrived in Gombe, Tanzania to study chimpanzees.", "What did Dr. Jane Goodall study?"],
        ["The Taj Mahal is located in Agra, India.", "Where is the Taj Mahal situated?"],
    ],
    interpretation="default",
)
iface = gr.Interface(**_interface_options)

# Start serving the interface.
iface.launch()