pratham0011 committed
Commit 6df7b9a · verified · 1 Parent(s): e48cc2b

Upload voice_chat.py

Files changed (1)
  1. voice_chat.py +68 -0
voice_chat.py ADDED
@@ -0,0 +1,68 @@
+ import os
+ import gradio as gr
+ import edge_tts
+ import asyncio
+ import tempfile
+ import numpy as np
+ import soxr
+ from pydub import AudioSegment
+ import torch
+ import sentencepiece as spm
+ import onnxruntime as ort
+ from huggingface_hub import hf_hub_download, InferenceClient
+ from dotenv import load_dotenv
+
+ # Speech Recognition Model Configuration
+ model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
+ sample_rate = 16000
+
+ # Download preprocessor, encoder and tokenizer
+ preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
+ encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
+ tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))
+
+ load_dotenv()
+ token = os.getenv("hf_key")  # Hugging Face API token, read from a local .env file
+ # Mistral Model Configuration
+ client1 = InferenceClient("mistralai/Mistral-7B-Instruct-v0.2", api_key=token)
+ system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. You will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
+
+ def resample(audio_fp32, sr):
+     return soxr.resample(audio_fp32, sr, sample_rate)  # resample to the model's 16 kHz input rate
+
+ def to_float32(audio_buffer):
+     return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)  # int PCM -> float32 in [-1, 1]
+
+ def transcribe(audio_path):
+     audio_file = AudioSegment.from_file(audio_path)
+     sr = audio_file.frame_rate
+     audio_buffer = np.array(audio_file.get_array_of_samples())
+
+     audio_fp32 = to_float32(audio_buffer)
+     audio_16k = resample(audio_fp32, sr)
+
+     input_signal = torch.tensor(audio_16k).unsqueeze(0)
+     length = torch.tensor(len(audio_16k)).unsqueeze(0)
+     processed_signal, _ = preprocessor.forward(input_signal=input_signal, length=length)  # TorchScript feature extraction
+
+     logits = encoder.run(None, {'audio_signal': processed_signal.numpy(), 'length': length.numpy()})[0][0]  # ONNX encoder -> per-frame token logits
+
+     blank_id = tokenizer.vocab_size()  # the CTC blank is the extra class after the vocabulary
+     decoded_prediction = [p for p in logits.argmax(axis=1).tolist() if p != blank_id]  # greedy decode, dropping blanks
+     text = tokenizer.decode_ids(decoded_prediction)
+
+     return text
+
+ def model(text):
+     formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
+     stream = client1.text_generation(formatted_prompt, max_new_tokens=300)
+     return stream[:-4]  # strip the trailing "</s>" end-of-sequence marker
+
+ async def respond(audio):
+     user = transcribe(audio)
+     reply = model(user)
+     communicate = edge_tts.Communicate(reply)
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
+         tmp_path = tmp_file.name
+         await communicate.save(tmp_path)
+     return tmp_path
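
As committed, gradio and asyncio are imported but never referenced, so the UI wiring presumably lives in a companion app file or a later commit. The sketch below is only an illustration of how the async respond handler could be connected to a microphone-in / audio-out interface; the gr.Interface arguments, the autoplay flag, and the demo.launch() call are assumptions based on the Gradio 4.x API, not part of this commit. Note also that running the file requires a .env file next to it providing the hf_key entry that load_dotenv() exposes.

# Hypothetical wiring, not part of this commit: connect the async respond()
# handler to a microphone-in / audio-out Gradio interface.
demo = gr.Interface(
    fn=respond,  # Gradio accepts async handlers directly
    inputs=gr.Audio(sources=["microphone"], type="filepath"),  # record speech, hand transcribe() a file path
    outputs=gr.Audio(type="filepath", autoplay=True),  # play back the synthesized WAV reply
    title="Voice Chat",  # hypothetical title, not from this commit
)

demo.launch()

Passing type="filepath" on both components matches what the committed functions expect: transcribe() takes a path it can open with pydub, and respond() returns the path of the temporary WAV written by edge_tts.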