File size: 1,754 Bytes
a263f35
ea52814
c150302
a263f35
eb6ba59
a263f35
1e629a8
ea52814
 
c150302
a263f35
eb6ba59
 
a263f35
 
 
 
c150302
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a263f35
 
 
c150302
 
a263f35
 
 
 
 
 
c150302
a263f35
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import gradio as gr
from transformers import pipeline, Wav2Vec2ProcessorWithLM
from pyannote.audio import Pipeline
from librosa import load, resample
from rpunct import RestorePuncts

# Checkpoint shared by the processor and the ASR pipeline below.
asr_model = 'patrickvonplaten/wav2vec2-base-100h-with-lm'

# The processor supplies the tokenizer, feature extractor and decoder that
# are handed to the pipeline explicitly.
processor = Wav2Vec2ProcessorWithLM.from_pretrained(asr_model)
asr = pipeline(
	'automatic-speech-recognition',
	model=asr_model,
	tokenizer=processor.tokenizer,
	feature_extractor=processor.feature_extractor,
	decoder=processor.decoder,
)

# Speaker segmentation pipeline; transcribe() iterates its output as
# (turn, _, speaker) tracks.
speaker_segmentation = Pipeline.from_pretrained("pyannote/speaker-segmentation")

# Punctuation restorer applied to each speaker turn's text.
rpunct = RestorePuncts()

def transcribe(filepath):
	"""Transcribe a recording and attribute the recognised words to speaker turns.

	Args:
		filepath: Path to the audio file to process.

	Returns:
		A ``(diarized_output, full_text)`` tuple of strings.
		``diarized_output`` holds one line per speaker turn that received at
		least one word, formatted as
		``<speaker>: ''<punctuated text>'' from <start>-<end>``.
		``full_text`` is the lower-cased transcript of the whole recording.
	"""
	# Decode directly at 16 kHz instead of loading at librosa's default rate
	# and resampling a second time.
	speech, _ = load(filepath, sr=16000)

	# The segmentation pipeline is given the audio file itself rather than the
	# decoded NumPy samples.
	speaker_output = speaker_segmentation(filepath)
	text = asr(speech, return_timestamps="word")

	full_text = text['text'].lower()
	chunks = text['chunks']

	turn_lines = []
	# Index of the next word chunk not yet assigned to a turn; it only moves
	# forward, so each word is attributed to at most one turn.
	i = 0
	for turn, _, speaker in speaker_output.itertracks(yield_label=True):
		pieces = []
		# Consume every remaining word whose end timestamp is within this turn.
		while i < len(chunks) and chunks[i]['timestamp'][1] <= turn.end:
			pieces.append(chunks[i]['text'].lower() + ' ')
			i += 1

		# Turns that received no words are omitted from the output.
		if pieces:
			diarized = rpunct.punctuate(''.join(pieces))
			turn_lines.append(
				"{}: ''{}'' from {:.3f}-{:.3f}\n".format(speaker, diarized, turn.start, turn.end)
			)

	diarized_output = ''.join(turn_lines)
	return diarized_output, full_text

# Input component: required microphone recording, configured with
# type='filepath' for the transcribe() callback.
mic = gr.inputs.Audio(
	source='microphone',
	type='filepath',
	label='Speech input',
	optional=False,
)

# Output components, in the same order as the tuple returned by transcribe().
diarized_transcript = gr.outputs.Textbox(
	type='auto',
	label='Diarized Output',
)
full_transcript = gr.outputs.Textbox(
	type='auto',
	label='Full Transcript',
)

# Wire the callback to its input and outputs, then start the app.
iface = gr.Interface(
	fn=transcribe,
	inputs=[mic],
	outputs=[diarized_transcript, full_transcript],
	theme='huggingface',
	description='Testing transcription',
)
iface.launch()