"""Gradio demo: transcribe and diarize a support call, then summarize it and
tag the customer's sentences with an emotion label."""

import os
from functools import partial

import torch
import gradio as gr
from transformers import pipeline, Wav2Vec2ProcessorWithLM
from pyannote.audio import Pipeline
import whisperx

from utils import split
from utils import speech_to_text as stt

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# NOTE(review): `device` is computed but never passed to the pipelines below,
# so they run on whatever device `transformers.pipeline` picks by default.
device = 0 if torch.cuda.is_available() else -1

# Highlight colour per emotion label shown in the sentiment view.
color_map = {
    "joy": "green",
    "anger": "red",
    "surprise": "yellow",
    "sadness": "blue",
    "fear": "orange",
    "love": "purple",
}

# Audio components
whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
whisper = whisperx.load_model("tiny.en", whisper_device)
alignment_model, metadata = whisperx.load_align_model(
    language_code="en", device=whisper_device
)
# Raises KeyError at import time if the ENO_TOKEN environment variable is unset.
speaker_segmentation = Pipeline.from_pretrained(
    "pyannote/speaker-diarization@2.1",
    use_auth_token=os.environ["ENO_TOKEN"],
)

# Bind the loaded models so the UI callback only needs the audio file path.
speech_to_text = partial(
    stt,
    speaker_segmentation=speaker_segmentation,
    whisper=whisper,
    alignment_model=alignment_model,
    metadata=metadata,
    whisper_device=whisper_device,
)

# Text components
emotion_pipeline = pipeline(
    "text-classification",
    model="bhadresh-savani/distilbert-base-uncased-emotion",
)
summarization_pipeline = pipeline(
    "summarization",
    model="knkarthick/MEETING_SUMMARY",
)


def summarize(diarized, summarization_pipeline):
    """Summarize a diarized transcript.

    Args:
        diarized: Sequence of ``(speech, speaker)`` pairs, as held by the
            transcript ``HighlightedText`` component.
        summarization_pipeline: Callable that takes the transcript text and
            returns a list whose first item has a ``"summary_text"`` key.

    Returns:
        The summary text, or an empty string when there is no transcript.
    """
    if not diarized:
        # Nothing transcribed yet: avoid calling the model on empty input.
        return ""
    # One "<speaker>: <speech>" line per turn, each preceded by a newline.
    text = "".join(f"\n{turn[1]}: {turn[0]}" for turn in diarized)
    return summarization_pipeline(text)[0]["summary_text"]


def sentiment(diarized, emotion_pipeline):
    """Label each customer sentence of a diarized transcript with an emotion.

    Every turn is inspected and kept only if its speaker label contains
    ``"Customer"``; the customer's position in the turn order does not matter.

    Args:
        diarized: Sequence of ``(speech, speaker)`` pairs.
        emotion_pipeline: Callable that takes a list of sentences and returns
            one mapping per sentence with a ``"label"`` key.

    Returns:
        A list of ``(sentence, emotion_label)`` tuples in transcript order.
    """
    customer_sentiments = []
    if not diarized:
        return customer_sentiments

    for speaker_speech, speaker_id in diarized:
        # Labels may be missing (None) for unlabelled spans; skip those.
        if not isinstance(speaker_id, str) or "Customer" not in speaker_id:
            continue
        sentences = split(speaker_speech)
        if not sentences:
            continue
        outputs = emotion_pipeline(sentences)
        customer_sentiments.extend(
            (sentence, output["label"])
            for output, sentence in zip(outputs, sentences)
        )
    return customer_sentiments


EXAMPLES = [["Customer_Support_Call.wav"]]

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            audio = gr.Audio(label="Audio file", type="filepath")
            btn = gr.Button("Transcribe and Diarize")

            gr.Markdown("**Call Transcript:**")
            diarized = gr.HighlightedText(label="Call Transcript")
            gr.Markdown("Summarize Speaker")
            sum_btn = gr.Button("Get Summary")
            summary = gr.Textbox(lines=4)
            sentiment_btn = gr.Button("Get Customer Sentiment")
            analyzed = gr.HighlightedText(color_map=color_map)

        with gr.Column():
            gr.Markdown("## Example Files")
            gr.Examples(
                examples=EXAMPLES,
                inputs=[audio],
                outputs=[diarized],
                fn=speech_to_text,
                cache_examples=True,
            )

    # when the transcribe button is clicked, convert audio file to text and diarize
    btn.click(fn=speech_to_text, inputs=audio, outputs=diarized)

    # when the summary button is clicked, create summary
    sum_btn.click(
        fn=partial(summarize, summarization_pipeline=summarization_pipeline),
        inputs=[diarized],
        outputs=summary,
    )

    # when the sentiment button is clicked, display emotion-highlighted text
    sentiment_btn.click(
        fn=partial(sentiment, emotion_pipeline=emotion_pipeline),
        inputs=diarized,
        outputs=[analyzed],
    )

demo.launch(debug=True)