File size: 2,212 Bytes
483362e
 
eb21f26
483362e
8dbb96b
 
d311eed
8dbb96b
 
 
1e61fa7
6b4a273
8dbb96b
 
 
 
1e61fa7
6b4a273
8dbb96b
 
483362e
 
eb21f26
0cc869c
eb21f26
 
 
 
0cc869c
eb21f26
 
a64dd7c
a8e533c
 
 
a64dd7c
03eb644
e93ec4d
eb21f26
f1e1b93
eb21f26
483362e
 
 
84d64d1
0cc869c
d311eed
eb21f26
 
0cc869c
483362e
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from transformers import pipeline
import gradio as gr
import time

# Two ASR pipelines: a Swedish fine-tune of Whisper small, and the stock
# OpenAI Whisper small as a baseline for comparison.
# device_map="auto" lets transformers place each model automatically
# (presumably on GPU when one is available — requires `accelerate`; TODO confirm).
pipe_fine = pipeline(model="zeihers-mart/whisper-small-swedish-basic", device_map="auto") 
pipe_raw = pipeline(model="openai/whisper-small", device_map="auto")
# Swedish BERT sentiment classifier, applied to the fine-tuned transcription.
sa = pipeline('sentiment-analysis', model='marma/bert-base-swedish-cased-sentiment')

# force swedish
# Whisper auto-detects language by default; forcing the decoder prompt ids
# pins both models to Swedish ("sv") and to transcription (not translation).
pipe_fine.model.config.forced_decoder_ids = (
    pipe_fine.tokenizer.get_decoder_prompt_ids(
        language="sv", task="transcribe"
    )
)

# Same forced-Swedish configuration for the baseline model, so the two
# transcriptions are comparable.
pipe_raw.model.config.forced_decoder_ids = (
    pipe_raw.tokenizer.get_decoder_prompt_ids(
        language="sv", task="transcribe"
    )
)

def transcribe(audio):
    """Transcribe `audio` with both Whisper pipelines and score sentiment.

    Returns a 4-tuple:
        (fine-tuned transcription, baseline transcription,
         smiley image URL matching the sentiment, human-readable summary).
    """
    # Run and time the fine-tuned Swedish model.
    t0 = time.time()
    text_sv = pipe_fine(audio)["text"]
    time_fine = time.time() - t0
    print(f"Fine-tuned: audio transcribed in {time_fine} seconds: {text_sv}")

    # Run and time the stock Whisper model on the same clip.
    t0 = time.time()
    text_raw = pipe_raw(audio)["text"]
    time_raw = time.time() - t0
    print(f"Raw: audio transcribed in {time_raw} seconds: {text_raw}")

    # Sentiment is evaluated on the fine-tuned transcription only.
    sentiment = sa(text_sv)
    print(f"Sentiment result: {sentiment}")
    sentiment = sentiment[0]["label"]

    happy_path = "https://upload.wikimedia.org/wikipedia/commons/thumb/e/e0/SNice.svg/1200px-SNice.svg.png"
    sad_path = "https://upload.wikimedia.org/wikipedia/commons/thumb/4/42/Sad_smiley_yellow_simple.svg/1024px-Sad_smiley_yellow_simple.svg.png"
    # Any label other than POSITIVE maps to the sad smiley.
    if sentiment == "POSITIVE":
        path = happy_path
    else:
        path = sad_path

    description = f"The fine-tuned model took {time_fine} seconds while the original Whisper model took {time_raw} seconds.\nThe sentiment was evaluated from the fine-tuned model transcription as {sentiment.lower()}."
    return text_sv, text_raw, path, description

# Gradio UI: record from the microphone, then show both transcriptions,
# a sentiment smiley image, and a timing/sentiment summary.
output_widgets = [
    gr.Textbox(label="Fine-tuned transcription"),
    gr.Textbox(label="Whisper transcription"),
    gr.Image(label="Sentiment from Fine-tuned transcription", width=250, height=250),
    gr.Textbox(label="Description"),
]

iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=output_widgets,
    title="Finetuned Whisper Swedish Small",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()