Spaces:
Sleeping
Sleeping
import functools
import json
import re

import gradio as gr
import requests
# Seconds to wait on each HTTP request so a stalled connection cannot hang the UI.
_REQUEST_TIMEOUT_SECONDS = 10

# Video ID carried in the query string (``...watch?v=<id>``).
_WATCH_ID_RE = re.compile(r"v=([a-zA-Z0-9_-]+)")

# Fallback for URL shapes that carry the video ID in the path instead.
_PATH_ID_RE = re.compile(
    r"(?:youtu\.be/|youtube\.com/(?:shorts|embed|live)/)([a-zA-Z0-9_-]+)"
)

# Captures the JSON object assigned to ``ytInitialPlayerResponse`` in the page.
_YT_INITIAL_PLAYER_RESPONSE_RE = re.compile(
    r"ytInitialPlayerResponse\s*=\s*({.+?})\s*;\s*(?:var\s+(?:meta|head)|</script|\n)"
)

# Translation table that deletes zero-width characters (ZWSP, ZWNJ, ZWJ, BOM).
_ZERO_WIDTH_TABLE = dict.fromkeys(map(ord, "\u200b\u200c\u200d\ufeff"))


def _extract_video_id(video_url):
    """Return the video ID found in *video_url*, or ``None`` if there is none."""
    # The ``v=`` pattern is tried first so every URL accepted before still
    # resolves to the same ID.
    for pattern in (_WATCH_ID_RE, _PATH_ID_RE):
        found = pattern.search(video_url)
        if found:
            return found.group(1)
    return None


def _flatten_transcript(transcript):
    """Join every caption segment of a parsed transcript into one clean string."""
    pieces = []
    for event in transcript.get("events") or []:
        # Events without a "segs" entry carry no text and are skipped.
        for seg in event.get("segs") or []:
            pieces.append(seg.get("utf8", ""))
    text = " ".join(pieces).translate(_ZERO_WIDTH_TABLE)
    # Collapse whitespace runs with a real regex; str.replace("\s+", " ")
    # only replaces the literal characters backslash, "s", "+".
    return re.sub(r"\s+", " ", text).strip()


def retrieve_transcript(video_url):
    """Fetch a YouTube video's metadata and caption text as one display string.

    The watch page is downloaded, the embedded ``ytInitialPlayerResponse``
    JSON is parsed, the caption tracks are ordered with ``compare_tracks``
    and the highest-priority track is downloaded in ``json3`` format.

    Args:
        video_url: URL of the video. Watch URLs (``?v=<id>``) are supported,
            as are ``youtu.be/<id>``, ``/shorts/<id>``, ``/embed/<id>`` and
            ``/live/<id>`` forms.

    Returns:
        On success, a string with title, author, views, duration and the
        transcript. On any failure, a human-readable error message; network
        errors are reported this way instead of being raised.
    """
    video_id = _extract_video_id(video_url or "")
    if video_id is None:
        return "Invalid YouTube URL. Please ensure it contains a valid video ID."

    # Fetch the YouTube page.
    try:
        response = requests.get(
            f"https://www.youtube.com/watch?v={video_id}",
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException:
        return "Unable to fetch video page. Please check the URL or your connection."
    if response.status_code != 200:
        return "Unable to fetch video page. Please check the URL or your connection."

    # Extract ytInitialPlayerResponse from the page.
    match = _YT_INITIAL_PLAYER_RESPONSE_RE.search(response.text)
    if not match:
        return "Unable to parse player response. The video may not have captions available."

    try:
        player = json.loads(match.group(1))
    except json.JSONDecodeError:
        return "Error decoding YouTube response. The response format may have changed."

    if "videoDetails" not in player:
        return "The video details could not be found. The video might be private, restricted, or unavailable."

    details = player["videoDetails"]
    metadata = {
        "title": details.get("title", "Unknown Title"),
        "duration": details.get("lengthSeconds", "Unknown Duration"),
        "author": details.get("author", "Unknown Author"),
        "views": details.get("viewCount", "Unknown Views"),
    }

    tracks = (
        player.get("captions", {})
        .get("playerCaptionsTracklistRenderer", {})
        .get("captionTracks", [])
    )
    if not tracks:
        return f"Title: {metadata['title']}\n\nNo captions available for this video."

    # list.sort()/sorted() accept only a keyword ``key`` in Python 3, so the
    # cmp-style comparator has to be adapted with cmp_to_key. The sort is
    # stable, so equally ranked tracks keep their original order.
    best_track = sorted(tracks, key=functools.cmp_to_key(compare_tracks))[0]

    # Fetch the transcript.
    try:
        transcript_response = requests.get(
            best_track["baseUrl"] + "&fmt=json3",
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException:
        return "Unable to fetch transcript. Please try again later."
    if transcript_response.status_code != 200:
        return "Unable to fetch transcript. Please try again later."
    try:
        transcript = transcript_response.json()
    except ValueError:
        # The body was not valid JSON (e.g. an empty response).
        return "Unable to fetch transcript. Please try again later."

    parsed_transcript = _flatten_transcript(transcript)
    return f"Title: {metadata['title']}\nAuthor: {metadata['author']}\nViews: {metadata['views']}\nDuration: {metadata['duration']} seconds\n\nTranscript:\n{parsed_transcript}"
def compare_tracks(track1, track2):
    """Compare two caption tracks, cmp-style, by preference.

    English tracks (``languageCode == "en"``) rank ahead of all others;
    when both or neither are English, a track whose ``kind`` is not
    ``"asr"`` ranks ahead of one whose ``kind`` is ``"asr"``.

    Returns:
        -1 if *track1* ranks first, 1 if *track2* ranks first, 0 on a tie.
    """
    first_is_english = track1.get("languageCode", "") == "en"
    second_is_english = track2.get("languageCode", "") == "en"
    if first_is_english != second_is_english:
        # Exactly one track is English; it wins.
        return -1 if first_is_english else 1

    first_is_asr = track1.get("kind") == "asr"
    second_is_asr = track2.get("kind") == "asr"
    if first_is_asr != second_is_asr:
        # Exactly one track is ASR; the other one wins.
        return 1 if first_is_asr else -1

    # Same priority: report a tie so a stable sort preserves the order.
    return 0
# Gradio callback
def gradio_interface(video_url):
    """Return the result of ``retrieve_transcript`` for *video_url*."""
    result_text = retrieve_transcript(video_url)
    return result_text
# Build the Gradio UI: text input, text output, served by gradio_interface.
interface = gr.Interface(
    title="YouTube Transcript Extractor",
    description="Enter a YouTube video URL to extract the transcript. The video must have captions available.",
    fn=gradio_interface,
    inputs="text",
    outputs="text",
)

# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    interface.launch()