Spaces:

dindizz
/

youtubetonewsstory

Sleeping

App Files Files Community

youtubetonewsstory / app.py

dindizz

Update app.py

394082a verified 2 months ago

raw

history blame contribute delete

4.01 kB

	import functools
	import json
	import re

	import gradio as gr
	import requests

	def retrieve_transcript(video_url):
	    """Fetch a YouTube video's captions and return them as display text.

	    The watch page is downloaded, the embedded ``ytInitialPlayerResponse``
	    JSON is extracted from it, the best caption track is chosen with
	    ``compare_tracks`` and that track is downloaded in ``json3`` format.

	    Args:
	        video_url: Watch URL containing a ``v=<video id>`` query parameter.
	            ``None`` or an empty string is treated as an invalid URL.

	    Returns:
	        A string. On success it holds the title, author, view count and
	        duration followed by the transcript. On any failure it holds a
	        human-readable error message instead; network errors are reported
	        this way rather than raised.
	    """
	    fetch_page_error = "Unable to fetch video page. Please check the URL or your connection."
	    fetch_transcript_error = "Unable to fetch transcript. Please try again later."
	    # Without a timeout a stalled connection would block the caller forever.
	    request_timeout = 10

	    # Extract video ID from the URL; `or ""` makes a None value fail the
	    # match cleanly instead of raising TypeError inside re.search.
	    video_id_match = re.search(r"v=([a-zA-Z0-9_-]+)", video_url or "")
	    if not video_id_match:
	        return "Invalid YouTube URL. Please ensure it contains a valid video ID."

	    video_id = video_id_match.group(1)

	    # The non-greedy capture stops at the first "}" that is followed by ";"
	    # and then "var meta", "var head", "</script" or a newline. Whitespace
	    # around "=" and ";" is optional and "|" is real regex alternation.
	    player_response_re = re.compile(
	        r"ytInitialPlayerResponse\s*=\s*({.+?})\s*;\s*(?:var\s+(?:meta|head)|</script|\n)"
	    )

	    # Fetch the YouTube page
	    try:
	        response = requests.get(
	            f"https://www.youtube.com/watch?v={video_id}",
	            timeout=request_timeout,
	        )
	    except requests.RequestException:
	        return fetch_page_error
	    if response.status_code != 200:
	        return fetch_page_error

	    # Extract ytInitialPlayerResponse from the page
	    match = player_response_re.search(response.text)
	    if not match:
	        return "Unable to parse player response. The video may not have captions available."

	    # json.loads only parses data; it never evaluates the page's JavaScript.
	    try:
	        player = json.loads(match.group(1))
	    except json.JSONDecodeError:
	        return "Error decoding YouTube response. The response format may have changed."

	    # Check if videoDetails exists
	    if "videoDetails" not in player:
	        return "The video details could not be found. The video might be private, restricted, or unavailable."

	    # Extract metadata
	    details = player["videoDetails"]
	    metadata = {
	        "title": details.get("title", "Unknown Title"),
	        "duration": details.get("lengthSeconds", "Unknown Duration"),
	        "author": details.get("author", "Unknown Author"),
	        "views": details.get("viewCount", "Unknown Views"),
	    }

	    tracks = (
	        player.get("captions", {})
	        .get("playerCaptionsTracklistRenderer", {})
	        .get("captionTracks", [])
	    )
	    if not tracks:
	        return f"Title: {metadata['title']}\n\nNo captions available for this video."

	    # list.sort() accepts only keyword arguments in Python 3, so the
	    # cmp-style comparator has to be adapted into a key function.
	    tracks.sort(key=functools.cmp_to_key(compare_tracks))

	    # Fetch the transcript of the highest-priority track
	    base_url = tracks[0].get("baseUrl")
	    if not base_url:
	        return fetch_transcript_error
	    try:
	        transcript_response = requests.get(
	            base_url + "&fmt=json3", timeout=request_timeout
	        )
	    except requests.RequestException:
	        return fetch_transcript_error
	    if transcript_response.status_code != 200:
	        return fetch_transcript_error

	    try:
	        transcript = transcript_response.json()
	    except ValueError:
	        # The body was not valid JSON.
	        return fetch_transcript_error

	    parsed_transcript = _transcript_to_text(transcript)

	    return f"Title: {metadata['title']}\nAuthor: {metadata['author']}\nViews: {metadata['views']}\nDuration: {metadata['duration']} seconds\n\nTranscript:\n{parsed_transcript}"


	def _transcript_to_text(transcript):
	    """Flatten a parsed caption payload into one whitespace-normalised string.

	    Reads ``transcript["events"]``; every event that has a ``"segs"`` list
	    contributes the ``"utf8"`` text of each segment. Missing keys are
	    treated as empty instead of raising.
	    """
	    joined = " ".join(
	        " ".join(seg.get("utf8", "") for seg in event["segs"])
	        for event in transcript.get("events", [])
	        if "segs" in event
	    )
	    # Remove zero-width characters (U+200B, U+200C, U+200D, U+FEFF) in one pass.
	    joined = joined.translate(str.maketrans("", "", "\u200b\u200c\u200d\ufeff"))
	    # str.replace() matches literally, so collapsing whitespace runs needs re.sub.
	    return re.sub(r"\s+", " ", joined).strip()


	def compare_tracks(track1, track2):
	    """Order two caption tracks: English first, then hand-made before ASR.

	    Args:
	        track1: Caption-track dict; ``languageCode`` and ``kind`` are read.
	        track2: Caption-track dict compared against ``track1``.

	    Returns:
	        -1 if ``track1`` should come first, 1 if ``track2`` should, and 0
	        when both have the same priority.
	    """

	    def rank(track):
	        # Lower tuples win: False sorts before True, and the language flag
	        # is compared before the ASR flag.
	        not_english = track.get("languageCode", "") != "en"
	        auto_generated = track.get("kind") == "asr"
	        return (not_english, auto_generated)

	    left, right = rank(track1), rank(track2)
	    # Classic three-way comparison yielding -1, 0 or 1.
	    return (left > right) - (left < right)


	# Callback wired into the Gradio UI.
	def gradio_interface(video_url):
	    """Return the result of ``retrieve_transcript`` for ``video_url``."""
	    transcript_text = retrieve_transcript(video_url)
	    return transcript_text


	# Build the Gradio front end: text input, text output, handled by gradio_interface.
	interface = gr.Interface(
	    title="YouTube Transcript Extractor",
	    description="Enter a YouTube video URL to extract the transcript. The video must have captions available.",
	    fn=gradio_interface,
	    inputs="text",
	    outputs="text",
	)

	# Launch only when executed as a script, not when imported as a module.
	if __name__ == "__main__":
	    interface.launch()