dindizz's picture
Update app.py
394082a verified
import functools
import json
import re

import gradio as gr
import requests
# Captures the JSON object assigned to ``ytInitialPlayerResponse`` in the
# watch-page HTML. Compiled once at import time instead of on every call.
_YT_INITIAL_PLAYER_RESPONSE_RE = re.compile(
    r"ytInitialPlayerResponse\s*=\s*({.+?})\s*;\s*(?:var\s+(?:meta|head)|</script|\n)"
)

# Seconds to wait for each HTTP request; without an explicit timeout
# ``requests`` can block indefinitely on an unresponsive server.
_REQUEST_TIMEOUT = 10

# Zero-width characters and the BOM, deleted from caption text in one pass.
_INVISIBLE_CHARS = str.maketrans("", "", "\u200b\u200c\u200d\ufeff")


def _extract_video_id(video_url):
    """Return the video ID contained in *video_url*, or ``None`` if absent."""
    url = video_url or ""
    # The classic ``...watch?v=<id>`` form is tried first so that every URL
    # that was accepted before still resolves to the same ID.
    found = re.search(r"v=([a-zA-Z0-9_-]+)", url)
    if found is None:
        # Fall back to path-style links (short links, Shorts, embeds).
        found = re.search(r"(?:youtu\.be/|/shorts/|/embed/)([a-zA-Z0-9_-]+)", url)
    return found.group(1) if found else None


def _flatten_transcript(transcript):
    """Join all caption segments of a parsed transcript into a single string."""
    text = " ".join(
        " ".join(seg.get("utf8", "") for seg in event["segs"])
        for event in transcript.get("events", [])
        if "segs" in event
    )
    # Drop invisible characters first, then collapse every whitespace run
    # (including newlines between segments) into one space.
    return re.sub(r"\s+", " ", text.translate(_INVISIBLE_CHARS))


def retrieve_transcript(video_url):
    """Fetch the title, basic metadata and caption text of a YouTube video.

    The watch page is downloaded, the embedded ``ytInitialPlayerResponse``
    JSON is parsed, the caption tracks are ranked with ``compare_tracks``
    and the best-ranked track is downloaded and flattened to plain text.

    Args:
        video_url: URL of the video. A ``v=<id>`` query parameter is
            preferred; ``youtu.be/<id>``, ``/shorts/<id>`` and
            ``/embed/<id>`` paths are accepted as a fallback.

    Returns:
        A human-readable string: either the formatted metadata and
        transcript, or a message describing why they could not be obtained.
        Network failures and malformed responses are reported through the
        returned message rather than raised.
    """
    video_id = _extract_video_id(video_url)
    if video_id is None:
        return "Invalid YouTube URL. Please ensure it contains a valid video ID."

    # Fetch the YouTube page
    page_error = "Unable to fetch video page. Please check the URL or your connection."
    try:
        response = requests.get(
            f"https://www.youtube.com/watch?v={video_id}",
            timeout=_REQUEST_TIMEOUT,
        )
    except requests.RequestException:
        return page_error
    if response.status_code != 200:
        return page_error

    # Extract ytInitialPlayerResponse from the page
    match = _YT_INITIAL_PLAYER_RESPONSE_RE.search(response.text)
    if not match:
        return "Unable to parse player response. The video may not have captions available."

    # json.loads (never eval) because the payload comes from a remote page.
    try:
        player = json.loads(match.group(1))
    except json.JSONDecodeError:
        return "Error decoding YouTube response. The response format may have changed."

    # Check if videoDetails exists
    if "videoDetails" not in player:
        return "The video details could not be found. The video might be private, restricted, or unavailable."

    # Extract metadata
    details = player["videoDetails"]
    metadata = {
        "title": details.get("title", "Unknown Title"),
        "duration": details.get("lengthSeconds", "Unknown Duration"),
        "author": details.get("author", "Unknown Author"),
        "views": details.get("viewCount", "Unknown Views"),
    }

    tracks = (
        player.get("captions", {})
        .get("playerCaptionsTracklistRenderer", {})
        .get("captionTracks", [])
    )
    if not tracks:
        return f"Title: {metadata['title']}\n\nNo captions available for this video."

    # list.sort() only accepts a keyword ``key`` in Python 3, so the
    # cmp-style comparator has to be adapted with cmp_to_key.
    tracks.sort(key=functools.cmp_to_key(compare_tracks))

    # Fetch the transcript of the best-ranked track
    transcript_error = "Unable to fetch transcript. Please try again later."
    try:
        transcript_response = requests.get(
            tracks[0]["baseUrl"] + "&fmt=json3",
            timeout=_REQUEST_TIMEOUT,
        )
    except requests.RequestException:
        return transcript_error
    if transcript_response.status_code != 200:
        return transcript_error
    try:
        transcript = transcript_response.json()
    except ValueError:
        # Body was empty or not JSON.
        return transcript_error

    parsed_transcript = _flatten_transcript(transcript)
    return f"Title: {metadata['title']}\nAuthor: {metadata['author']}\nViews: {metadata['views']}\nDuration: {metadata['duration']} seconds\n\nTranscript:\n{parsed_transcript}"
def compare_tracks(track1, track2):
    """Order two caption tracks: English before other languages, then
    manually authored tracks before auto-generated ("asr") ones.

    Returns -1 if *track1* ranks first, 1 if *track2* ranks first, and 0
    when both have the same priority.
    """

    def rank(track):
        # Lower tuples rank first; False sorts before True, so an English,
        # non-ASR track yields the smallest possible rank.
        not_english = track.get("languageCode", "") != "en"
        auto_generated = track.get("kind") == "asr"
        return (not_english, auto_generated)

    first, second = rank(track1), rank(track2)
    return (first > second) - (first < second)
# Gradio callback
def gradio_interface(video_url):
    """Handle one UI submission by delegating to ``retrieve_transcript``."""
    result = retrieve_transcript(video_url)
    return result
# Build the UI: a single text input wired to a single text output.
interface = gr.Interface(
    title="YouTube Transcript Extractor",
    description="Enter a YouTube video URL to extract the transcript. The video must have captions available.",
    fn=gradio_interface,
    inputs="text",
    outputs="text",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()