# ASR/chunkedTranscriber.py
import os
import gc
import sys
import logging

import torch
import spaces
import torchaudio
from dotenv import load_dotenv
from difflib import SequenceMatcher
from pyannote.audio import Pipeline
from transformers import (
    Wav2Vec2ForSequenceClassification,
    AutoFeatureExtractor,
    Wav2Vec2ForCTC,
    AutoProcessor,
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
)

load_dotenv()

logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
class ChunkedTranscriber:
def __init__(self, chunk_size=5, overlap=1, sample_rate=16000):
self.chunk_size = chunk_size
self.overlap = overlap
self.sample_rate = sample_rate
self.previous_text = ""
self.previous_lang = None
self.speaker_diarization_pipeline = self.load_speaker_diarization_pipeline()
def load_speaker_diarization_pipeline(self):
"""
Load the pre-trained speaker diarization pipeline from pyannote-audio.
"""
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=os.getenv("HF_TOKEN"))
return pipeline
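    # Note: the pipeline loaded above is a gated checkpoint on the Hugging Face Hub; the
    # HF_TOKEN read from the environment must belong to an account that has accepted the
    # model's access conditions, otherwise from_pretrained() may fail or return None.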
@spaces.GPU(duration=60)
def diarize_audio(self, audio_path):
"""
Perform speaker diarization on the input audio.
"""
diarization_result = self.speaker_diarization_pipeline({"uri": "audio", "audio": audio_path})
return diarization_result
def load_lid_mms(self):
model_id = "facebook/mms-lid-256"
processor = AutoFeatureExtractor.from_pretrained(model_id)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_id)
return processor, model
@spaces.GPU(duration=60)
def language_identification(self, model, processor, chunk, device="cuda"):
inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
model.to(device)
inputs.to(device)
with torch.no_grad():
outputs = model(**inputs).logits
lang_id = torch.argmax(outputs, dim=-1)[0].item()
detected_lang = model.config.id2label[lang_id]
del model
del inputs
torch.cuda.empty_cache()
gc.collect()
return detected_lang
    def load_mms(self):
model_id = "facebook/mms-1b-all"
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
return model, processor
@spaces.GPU(duration=60)
def mms_transcription(self, model, processor, chunk, device="cuda"):
inputs = processor(chunk, sampling_rate=16_000, return_tensors="pt")
model.to(device)
inputs.to(device)
with torch.no_grad():
outputs = model(**inputs).logits
ids = torch.argmax(outputs, dim=-1)[0]
transcription = processor.decode(ids)
del model
del inputs
torch.cuda.empty_cache()
gc.collect()
return transcription
    def load_T2T_translation_model(self):
model_id = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
return model, tokenizer
@spaces.GPU(duration=60)
def text2text_translation(self, translation_model, translation_tokenizer, transcript, device="cuda"):
        # The model and tokenizer are loaded once by the caller via load_T2T_translation_model()
tokenized_inputs = translation_tokenizer(transcript, return_tensors='pt')
translation_model.to(device)
tokenized_inputs.to(device)
translated_tokens = translation_model.generate(**tokenized_inputs,
forced_bos_token_id=translation_tokenizer.convert_tokens_to_ids("eng_Latn"),
max_length=100)
del translation_model
del tokenized_inputs
torch.cuda.empty_cache()
gc.collect()
return translation_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
def preprocess_audio(self, audio):
"""
Create overlapping chunks with improved timing logic
"""
chunk_samples = int(self.chunk_size * self.sample_rate)
overlap_samples = int(self.overlap * self.sample_rate)
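        # With the defaults (chunk_size=5 s, overlap=1 s, sample_rate=16 kHz):
        # chunk_samples = 80_000 and overlap_samples = 16_000, so start_idx advances by
        # 64_000 samples (4 s) per iteration and the recorded windows are 0-5 s, 4-9 s,
        # 8-13 s, ...; every chunk after the first also prepends the preceding 1 s of
        # audio, so words cut at a boundary appear in two consecutive chunks.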
chunks_with_times = []
start_idx = 0
while start_idx < len(audio):
end_idx = min(start_idx + chunk_samples, len(audio))
# Add padding for first chunk
if start_idx == 0:
chunk = audio[start_idx:end_idx]
padding = torch.zeros(int(1 * self.sample_rate))
chunk = torch.cat([padding, chunk])
else:
# Include overlap from previous chunk
actual_start = max(0, start_idx - overlap_samples)
chunk = audio[actual_start:end_idx]
# Pad if necessary
if len(chunk) < chunk_samples:
chunk = torch.nn.functional.pad(chunk, (0, chunk_samples - len(chunk)))
# Adjust time ranges to account for overlaps
chunk_start_time = max(0, (start_idx / self.sample_rate) - self.overlap)
chunk_end_time = min((end_idx / self.sample_rate) + self.overlap, len(audio) / self.sample_rate)
chunks_with_times.append({
'chunk': chunk,
'start_time': start_idx / self.sample_rate,
'end_time': end_idx / self.sample_rate,
'transcribe_start': chunk_start_time,
'transcribe_end': chunk_end_time
})
# Move to next chunk with smaller step size for better continuity
start_idx += (chunk_samples - overlap_samples)
return chunks_with_times
def merge_close_segments(self, results):
"""
Merge segments that are close in time and have the same language
"""
if not results:
return results
merged = []
current = results[0]
for next_segment in results[1:]:
# Skip empty segments
if not next_segment['text'].strip():
continue
# If segments are in the same language and close in time
if (current['detected_language'] == next_segment['detected_language'] and
abs(next_segment['start_time'] - current['end_time']) <= self.overlap):
# Merge the segments
current['text'] = current['text'] + ' ' + next_segment['text']
current['end_time'] = next_segment['end_time']
if 'translated' in current and 'translated' in next_segment:
current['translated'] = current['translated'] + ' ' + next_segment['translated']
else:
if current['text'].strip(): # Only add non-empty segments
merged.append(current)
current = next_segment
if current['text'].strip(): # Add the last segment if non-empty
merged.append(current)
return merged
def clean_overlapping_text(self, current_text, prev_text, current_lang, prev_lang, min_overlap=3):
"""
Improved text cleaning with language awareness and better sentence boundary handling
"""
if not prev_text or not current_text:
return current_text
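        # Illustrative example (hypothetical text): if the previous chunk ended with
        # "... we will meet at the station" and the current one begins with
        # "meet at the station at noon", SequenceMatcher reports the matching block
        # ["meet", "at", "the", "station"] at b == 0 with size 4 >= min_overlap,
        # so the current text is trimmed to "at noon".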
# If languages are different, don't try to merge
if prev_lang and current_lang and prev_lang != current_lang:
return current_text
# Split into words
prev_words = prev_text.split()
curr_words = current_text.split()
if len(prev_words) < 2 or len(curr_words) < 2:
return current_text
# Find matching sequences at the end of prev_text and start of current_text
matcher = SequenceMatcher(None, prev_words, curr_words)
matches = list(matcher.get_matching_blocks())
        # Look for the largest overlap anchored at the start of the current text
        best_overlap = 0
        for match in matches:
            # Keep the longest match that begins the current chunk's text
            if match.b == 0 and match.size >= min_overlap and match.size > best_overlap:
                best_overlap = match.size
if best_overlap > 0:
# Remove overlapping content while preserving sentence integrity
cleaned_words = curr_words[best_overlap:]
if not cleaned_words: # If everything was overlapping
return ""
return ' '.join(cleaned_words).strip()
return current_text
def process_chunk(self, chunk_data, mms_model, mms_processor, translation_model=None, translation_tokenizer=None):
"""
Process chunk with improved language handling
"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
try:
# Language detection
lid_processor, lid_model = self.load_lid_mms()
lid_lang = self.language_identification(lid_model, lid_processor, chunk_data['chunk'])
# Configure processor
mms_processor.tokenizer.set_target_lang(lid_lang)
mms_model.load_adapter(lid_lang)
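            # Assumes the label returned by mms-lid-256 is an ISO 639-3 code (e.g. "eng",
            # "hin") matching an adapter shipped with mms-1b-all; an unsupported code would
            # make set_target_lang()/load_adapter() raise and be handled below.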
# Transcribe
inputs = mms_processor(chunk_data['chunk'], sampling_rate=self.sample_rate, return_tensors="pt")
inputs = inputs.to(device)
mms_model = mms_model.to(device)
with torch.no_grad():
outputs = mms_model(**inputs).logits
ids = torch.argmax(outputs, dim=-1)[0]
transcription = mms_processor.decode(ids)
# Clean overlapping text with language awareness
cleaned_transcription = self.clean_overlapping_text(
transcription,
self.previous_text,
lid_lang,
self.previous_lang,
min_overlap=3
)
# Update previous state
self.previous_text = transcription
self.previous_lang = lid_lang
if not cleaned_transcription.strip():
return None
result = {
'start_time': chunk_data['start_time'],
'end_time': chunk_data['end_time'],
'text': cleaned_transcription,
'detected_language': lid_lang
}
# Handle translation
if translation_model and translation_tokenizer and cleaned_transcription.strip():
translation = self.text2text_translation(
translation_model,
translation_tokenizer,
cleaned_transcription
)
result['translated'] = translation
return result
except Exception as e:
            logger.error(f"Error processing chunk: {e}")
return None
finally:
torch.cuda.empty_cache()
gc.collect()
def translate_text(self, text, translation_model, translation_tokenizer, device):
"""
Translate cleaned text using the provided translation model.
"""
tokenized_inputs = translation_tokenizer(text, return_tensors='pt')
tokenized_inputs = tokenized_inputs.to(device)
translation_model = translation_model.to(device)
translated_tokens = translation_model.generate(
**tokenized_inputs,
forced_bos_token_id=translation_tokenizer.convert_tokens_to_ids("eng_Latn"),
max_length=100
)
translation = translation_tokenizer.batch_decode(
translated_tokens,
skip_special_tokens=True
)[0]
del translation_model
del tokenized_inputs
torch.cuda.empty_cache()
gc.collect()
return translation
def transcribe_audio(self, audio_path, translate=False):
"""
Main transcription function with improved segment merging
"""
# Perform speaker diarization
diarization_result = self.diarize_audio(audio_path)
# Extract speaker segments
speaker_segments = []
for turn, _, speaker in diarization_result.itertracks(yield_label=True):
speaker_segments.append({
'start_time': turn.start,
'end_time': turn.end,
'speaker': speaker
})
audio = self.load_audio(audio_path)
chunks = self.preprocess_audio(audio)
mms_model, mms_processor = self.load_mms()
translation_model, translation_tokenizer = None, None
if translate:
translation_model, translation_tokenizer = self.load_T2T_translation_model()
# Process chunks
results = []
for chunk_data in chunks:
result = self.process_chunk(
chunk_data,
mms_model,
mms_processor,
translation_model,
translation_tokenizer
)
            if result:
                # Attach the speaker whose diarization turn contains this chunk's start time
                for segment in speaker_segments:
                    if int(segment['start_time']) <= int(chunk_data['start_time']) < int(segment['end_time']):
                        result['speaker'] = segment['speaker']
                        break
                results.append(result)
# Merge close segments and clean up
merged_results = self.merge_close_segments(results)
        _translation = ""
        _output = ""
        for res in merged_results:
            # Guard against segments that never got a speaker match or a translation
            speaker = res.get('speaker', 'SPEAKER_unknown').split('_')[-1]
            translated = res.get('translated', '')
            _translation += translated + " "
            _output += (
                f"{res['start_time']}-{res['end_time']} - Speaker: {speaker} - "
                f"Language: {res['detected_language']}\n Text: {res['text']}\n Translation: {translated}\n\n"
            )
logger.info(f"\n\n TRANSLATION: {_translation}")
return _translation, _output
def load_audio(self, audio_path):
"""
Load and preprocess audio file.
"""
waveform, sample_rate = torchaudio.load(audio_path)
# Convert to mono if stereo
if waveform.shape[0] > 1:
waveform = torch.mean(waveform, dim=0)
else:
waveform = waveform.squeeze(0)
# Resample if necessary
if sample_rate != self.sample_rate:
resampler = torchaudio.transforms.Resample(
orig_freq=sample_rate,
new_freq=self.sample_rate
)
waveform = resampler(waveform)
return waveform.float()
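

# Minimal usage sketch (illustrative, not part of the original Space code): it drives the
# pipeline above end to end on a local file. "sample.wav" and the environment setup
# (HF_TOKEN in a .env file) are assumptions for this example.
if __name__ == "__main__":
    transcriber = ChunkedTranscriber(chunk_size=5, overlap=1, sample_rate=16000)
    # Transcribe and translate into English; the report string groups text by segment.
    translation, report = transcriber.transcribe_audio("sample.wav", translate=True)
    logger.info(report)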