Update app.py
Browse files
app.py
CHANGED
@@ -268,17 +268,24 @@ def get_video_id(youtube_url):
|
|
268 |
video_id = parse_qs(parsed_url.query).get("v")
|
269 |
return video_id[0] if video_id else None
|
270 |
|
271 |
-
|
272 |
def get_transcript(video_id):
|
273 |
tran = []
|
274 |
transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
|
275 |
-
transcript = transcript_list.find_generated_transcript(['vi','en'])
|
276 |
translated_transcript = transcript.translate('en')
|
277 |
transcript_data = translated_transcript.fetch()
|
278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
279 |
return ' '.join(tran)
|
280 |
|
281 |
|
|
|
282 |
def chunk_text(text, chunk_size=1000, overlap_size=24):
|
283 |
encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
|
284 |
chunk_overlap=overlap_size)
|
|
|
268 |
video_id = parse_qs(parsed_url.query).get("v")
|
269 |
return video_id[0] if video_id else None
|
270 |
|
|
|
271 |
def get_transcript(video_id):
    """Return a video's auto-generated transcript as one English string.

    Selects the auto-generated transcript for the language codes
    ``['vi', 'en']`` (passed to ``find_generated_transcript`` in that
    order), translates it to English unless it is already English, drops
    every segment whose whole text is a non-speech marker such as
    ``[music]``, and joins the remaining segment texts with single spaces.

    Args:
        video_id: Video identifier handed to
            ``YouTubeTranscriptApi.list_transcripts``.

    Returns:
        The kept segment texts joined by ``' '``; an empty string when no
        segment survives the filter.

    Raises:
        Nothing is caught here: any error raised by the transcript API
        calls (lookup, translation, fetch) propagates to the caller.
    """
    # Segments consisting solely of one of these markers carry no spoken
    # content. A set gives O(1) membership tests.
    # NOTE(review): '[clause]' may have been intended as '[applause]' --
    # confirm with the author before changing the literal.
    markers_to_remove = {
        '[music]', '[clause]', '[smile]', '[laugh]', '[cry]', '[sigh]',
        '[uh]', '[um]', '[uh-huh]', '[sob]', '[giggle]', '[hmm]',
    }

    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    transcript = transcript_list.find_generated_transcript(['vi', 'en'])

    # Translating an English transcript to English is a wasted request and
    # one more thing that can fail, so only translate when it is needed.
    if transcript.language_code != 'en':
        transcript = transcript.translate('en')
    transcript_data = transcript.fetch()

    tran = []
    for segment in transcript_data:
        # fetch() yields dicts in older youtube-transcript-api releases and
        # snippet objects exposing ``.text`` in newer ones; accept both.
        text = segment['text'] if isinstance(segment, dict) else segment.text

        # Compare on a normalised copy so "[Music]" or " [music]\n" are
        # recognised, but keep the original text for segments we retain.
        if text.strip().lower() not in markers_to_remove:
            tran.append(text)

    return ' '.join(tran)
|
286 |
|
287 |
|
288 |
+
|
289 |
def chunk_text(text, chunk_size=1000, overlap_size=24):
|
290 |
encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
|
291 |
chunk_overlap=overlap_size)
|