tiennlu committed on
Commit
762fc9f
·
verified ·
1 Parent(s): d744d35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -3
app.py CHANGED
@@ -268,17 +268,24 @@ def get_video_id(youtube_url):
268
  video_id = parse_qs(parsed_url.query).get("v")
269
  return video_id[0] if video_id else None
270
 
271
-
272
  def get_transcript(video_id):
273
  tran = []
274
  transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
275
- transcript = transcript_list.find_generated_transcript(['vi','en'])
276
  translated_transcript = transcript.translate('en')
277
  transcript_data = translated_transcript.fetch()
278
- tran += [t['text'] for t in transcript_data if t['text'] != '[music]']
 
 
 
 
 
 
 
279
  return ' '.join(tran)
280
 
281
 
 
282
  def chunk_text(text, chunk_size=1000, overlap_size=24):
283
  encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
284
  chunk_overlap=overlap_size)
 
268
  video_id = parse_qs(parsed_url.query).get("v")
269
  return video_id[0] if video_id else None
270
 
 
271
  def get_transcript(video_id):
272
  tran = []
273
  transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
274
+ transcript = transcript_list.find_generated_transcript(['vi', 'en'])
275
  translated_transcript = transcript.translate('en')
276
  transcript_data = translated_transcript.fetch()
277
+
278
+ words_to_remove = ['[music]', '[clause]', '[smile]', '[laugh]', '[cry]', '[sigh]', '[uh]', '[um]', '[uh-huh]', '[sob]', '[giggle]', '[hmm]']
279
+
280
+
281
+ for t in transcript_data:
282
+ if t['text'].lower() not in words_to_remove:
283
+ tran.append(t['text'])
284
+
285
  return ' '.join(tran)
286
 
287
 
288
+
289
  def chunk_text(text, chunk_size=1000, overlap_size=24):
290
  encoder = RecursiveCharacterTextSplitter().from_tiktoken_encoder(model_name="gpt-3.5-turbo", chunk_size=chunk_size,
291
  chunk_overlap=overlap_size)