Spaces:

eustlb
/

whisper-vs-distil-whisper-fr

Runtime error

App Files Files Community

whisper-vs-distil-whisper-fr / app.py

trip-fontaine

add whisper tiny

5578aca 7 months ago

raw

history blame contribute delete

8.76 kB

	import spaces
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available
	from transformers.pipelines.audio_utils import ffmpeg_read
	import torch
	import gradio as gr
	import time
	import copy
	import numpy as np

	# Benchmark settings shared by warmup() and transcribe().
	BATCH_SIZE = 16  # batch_size handed to every pipeline call
	MAX_AUDIO_MINS = 30  # maximum audio input in minutes
	N_WARMUP = 3  # number of warm-up rounds run by warmup()

	# Use the first CUDA device in half precision when a GPU is visible,
	# otherwise fall back to CPU in full precision.
	if torch.cuda.is_available():
	    device = "cuda:0"
	    torch_dtype = torch.float16
	else:
	    device = "cpu"
	    torch_dtype = torch.float32

	# Attention backend, in order of preference: Flash Attention 2, SDPA, eager.
	if is_flash_attn_2_available():
	    attn_implementation = "flash_attention_2"
	elif is_torch_sdpa_available():
	    attn_implementation = "sdpa"
	else:
	    attn_implementation = "eager"

	def _load_model(checkpoint):
	    """Load a speech seq2seq checkpoint with the dtype and attention backend chosen above."""
	    return AutoModelForSpeechSeq2Seq.from_pretrained(
	        checkpoint,
	        torch_dtype=torch_dtype,
	        low_cpu_mem_usage=True,
	        use_safetensors=True,
	        attn_implementation=attn_implementation,
	    )


	# The three models being compared.
	model = _load_model("openai/whisper-large-v3")
	distilled_model = _load_model("eustlb/distil-large-v3-fr")
	tiny_model = _load_model("openai/whisper-tiny")

	# One processor for the large-v3 checkpoint and a separate one for whisper-tiny.
	processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
	processor_tiny = AutoProcessor.from_pretrained("openai/whisper-tiny")

	# Move every model to the selected device.
	for _loaded_model in (model, distilled_model, tiny_model):
	    _loaded_model.to(device)

	def _make_asr_pipeline(asr_model, asr_processor, chunk_length_s, **extra):
	    """Build a French-transcription ASR pipeline with the settings common to all three models.

	    Any additional keyword arguments in ``extra`` are forwarded to ``pipeline`` unchanged.
	    """
	    return pipeline(
	        "automatic-speech-recognition",
	        model=asr_model,
	        tokenizer=asr_processor.tokenizer,
	        feature_extractor=asr_processor.feature_extractor,
	        max_new_tokens=128,
	        chunk_length_s=chunk_length_s,
	        torch_dtype=torch_dtype,
	        device=device,
	        generate_kwargs={"language": "fr", "task": "transcribe"},
	        **extra,
	    )


	# Whisper large-v3: 30 s chunks; the only pipeline created with return_timestamps=True.
	pipe = _make_asr_pipeline(model, processor, 30, return_timestamps=True)
	# Keep a handle on the unpatched `_forward`; transcribe() wraps it with a timer.
	pipe_forward = pipe._forward

	# Distilled French model: 25 s chunks, sharing the large-v3 processor.
	distil_pipe = _make_asr_pipeline(distilled_model, processor, 25)
	distil_pipe_forward = distil_pipe._forward

	# Whisper tiny: 30 s chunks with its own processor.
	tiny_pipe = _make_asr_pipeline(tiny_model, processor_tiny, 30)
	tiny_pipe_forward = tiny_pipe._forward


	def warmup():
	    """Run each pipeline N_WARMUP times on 30 seconds of random noise.

	    The transcriptions are discarded; the calls exist only for their side
	    effect of exercising the models before a timed run.
	    """
	    sampling_rate = pipe.feature_extractor.sampling_rate
	    noise = {
	        "array": np.random.randn(30 * sampling_rate),
	        "sampling_rate": sampling_rate,
	    }

	    for _ in range(N_WARMUP):
	        # Same order on every round: large-v3, distilled, tiny.
	        for asr_pipe in (pipe, distil_pipe, tiny_pipe):
	            # Each call gets its own shallow copy of the dict
	            # (presumably because the pipeline mutates its input - confirm).
	            _ = asr_pipe(noise.copy(), batch_size=BATCH_SIZE)["text"]

	def _run_timed(asr_pipe, original_forward, inputs):
	    """Transcribe ``inputs`` with ``asr_pipe`` and time its forward pass.

	    ``asr_pipe._forward`` is temporarily replaced by a wrapper around
	    ``original_forward`` that measures how long each call takes. The
	    durations are summed, because ``_forward`` may be invoked more than once
	    per transcription (presumably once per batch of chunks - confirm against
	    the Transformers pipeline implementation).

	    Args:
	        asr_pipe: the pipeline object to run.
	        original_forward: the pipeline's unpatched ``_forward`` callable.
	        inputs: dict with ``"array"`` and ``"sampling_rate"`` keys; a shallow
	            copy is passed to the pipeline so the caller's dict can be reused.

	    Returns:
	        A ``(text, seconds)`` tuple, with ``seconds`` rounded to 2 decimals.
	    """
	    elapsed = 0.0

	    def _timed_forward(*args, **kwargs):
	        # Forward positional AND keyword arguments untouched to the real method.
	        nonlocal elapsed
	        start_time = time.perf_counter()
	        result = original_forward(*args, **kwargs)
	        elapsed += time.perf_counter() - start_time
	        return result

	    asr_pipe._forward = _timed_forward
	    try:
	        text = asr_pipe(inputs.copy(), batch_size=BATCH_SIZE)["text"]
	    finally:
	        # Always undo the patch so later calls (e.g. warmup) run unwrapped.
	        asr_pipe._forward = original_forward
	    return text, round(elapsed, 2)


	@spaces.GPU
	def transcribe(inputs):
	    """Transcribe an audio file with the three models, yielding results as they complete.

	    Args:
	        inputs: path to the audio file, or ``None`` when nothing was submitted.

	    Yields:
	        6-tuples ``(distil_text, distil_time, tiny_text, tiny_time, text, time)``.
	        Entries for models that have not run yet are ``None``. Three tuples
	        are yielded in total, after the distilled, tiny and large-v3 runs.

	    Raises:
	        gr.Error: if no file was submitted, or the audio is longer than
	            ``MAX_AUDIO_MINS`` minutes.
	    """
	    # Validate before warming up so invalid requests fail fast.
	    if inputs is None:
	        raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")

	    sampling_rate = pipe.feature_extractor.sampling_rate

	    with open(inputs, "rb") as f:
	        audio_bytes = f.read()

	    audio = ffmpeg_read(audio_bytes, sampling_rate)
	    audio_length_mins = len(audio) / sampling_rate / 60

	    if audio_length_mins > MAX_AUDIO_MINS:
	        raise gr.Error(
	            f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes. "
	            f"Got an audio of length {round(audio_length_mins, 3)} minutes."
	        )

	    # warmup the gpu
	    print("Warming up...")
	    warmup()
	    print("Models warmed up!")

	    inputs = {"array": audio, "sampling_rate": sampling_rate}

	    # Timings stay in locals, so no module-level name is overwritten.
	    distil_text, distil_seconds = _run_timed(distil_pipe, distil_pipe_forward, inputs)
	    yield distil_text, distil_seconds, None, None, None, None

	    tiny_text, tiny_seconds = _run_timed(tiny_pipe, tiny_pipe_forward, inputs)
	    yield distil_text, distil_seconds, tiny_text, tiny_seconds, None, None

	    text, seconds = _run_timed(pipe, pipe_forward, inputs)
	    yield distil_text, distil_seconds, tiny_text, tiny_seconds, text, seconds

	if __name__ == "__main__":
	    # Build the Gradio UI: header, description, audio input, one row of timing
	    # boxes, one row of transcription boxes, and a set of example files.
	    with gr.Blocks() as demo:
	        gr.HTML(
	            """
	            <div style="text-align: center; max-width: 700px; margin: 0 auto;">
	            <div
	            style="
	            display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
	            "
	            >
	            <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
	            Whisper vs. distil-large-v3-fr: Speed Comparison 🏎️
	            </h1>
	            </div>
	            </div>
	            """
	        )
	        # The advertised limit is interpolated from MAX_AUDIO_MINS so the text
	        # cannot drift from the value enforced in transcribe().
	        gr.HTML(
	            f"""
	            <p>🚀 <a href="https://huggingface.co/eustlb/distil-large-v3-fr">distil-large-v3-fr</a> is a distilled variant of the <a href="https://huggingface.co/openai/whisper-large-v3">Whisper</a> model by OpenAI. Compared to Whisper, this French ASR distilled model runs 6x faster with 50% fewer parameters, while performing to within 1% word error rate (WER) on out-of-distribution evaluation data. It is also faster than the <a href="https://huggingface.co/openai/whisper-tiny">tiniest version of Whisper</a> while being incomparably more accurate (see <a href="https://huggingface.co/eustlb/distil-large-v3-fr#results">results</a>).</p>

	            <p>🛠️ In this demo, we perform a speed comparison between: <a href="https://huggingface.co/openai/whisper-large-v3">Whisper large-v3</a>, <a href="https://huggingface.co/openai/whisper-tiny">Whisper tiny</a> and <a href="https://huggingface.co/eustlb/distil-large-v3-fr">distil-large-v3-fr</a> to test this claim. Models use the <a href="https://huggingface.co/distil-whisper/distil-large-v3#chunked-long-form">chunked long-form transcription algorithm</a> in 🤗 Transformers.

	            To use <a href="https://huggingface.co/eustlb/distil-large-v3-fr">distil-large-v3-fr</a>, check the <a href="https://huggingface.co/eustlb/distil-large-v3-fr#transformers-usage">model card</a>! ⚙️</p>

	            <p>⏱️ To ensure fair usage of the Space, we ask that audio file inputs are kept to less than {MAX_AUDIO_MINS} mins.</p>
	            """
	        )
	        audio = gr.components.Audio(type="filepath", label="Audio input")
	        button = gr.Button("Transcribe")

	        # Component variables carry a `_box` suffix so they do not share names
	        # with the timing values handled inside transcribe().
	        with gr.Row():
	            distil_runtime_box = gr.components.Textbox(label="distil-large-v3-fr Transcription Time (s)")
	            tiny_runtime_box = gr.components.Textbox(label="whisper-tiny Transcription Time (s)")
	            runtime_box = gr.components.Textbox(label="whisper-large-v3 Transcription Time (s)")

	        with gr.Row():
	            distil_transcription_box = gr.components.Textbox(
	                label="distil-large-v3-fr Transcription", show_copy_button=True
	            )
	            tiny_transcription_box = gr.components.Textbox(
	                label="whisper-tiny Transcription", show_copy_button=True
	            )
	            transcription_box = gr.components.Textbox(
	                label="whisper-large-v3 Transcription", show_copy_button=True
	            )

	        # Order must match the tuples yielded by transcribe():
	        # (distil text, distil time, tiny text, tiny time, large text, large time).
	        result_boxes = [
	            distil_transcription_box,
	            distil_runtime_box,
	            tiny_transcription_box,
	            tiny_runtime_box,
	            transcription_box,
	            runtime_box,
	        ]

	        button.click(
	            fn=transcribe,
	            inputs=audio,
	            outputs=result_boxes,
	        )
	        gr.Markdown("## Examples")
	        gr.Examples(
	            [["./assets/example_1.wav"], ["./assets/example_2.wav"]],
	            audio,
	            outputs=result_boxes,
	            fn=transcribe,
	            cache_examples=False,
	        )
	    demo.queue(max_size=10).launch()