wldmr committed on
Commit
008c815
·
1 Parent(s): e817fae

load sequence

Browse files
Files changed (2) hide show
  1. app.py +135 -73
  2. yt_stats.py +2 -2
app.py CHANGED
@@ -156,6 +156,96 @@ def get_extracted_text(raw_text):
156
  def get_extracted_text_to_dict(raw_text):
157
  st.session_state['extract'] = [raw_text,0,0,0,0]
158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
 
160
  #######################################################################################
161
  # Application Start
@@ -247,19 +337,12 @@ st.write("")
247
  # Load Transcript
248
  ###########################
249
 
250
- transcript_list = yta.list_transcripts(video_id)
251
 
252
- transcript_raw = None
253
- transcript_item = transcript_list.find_transcript(['en'])
254
- transcript_item_is_generated = transcript_item.is_generated
255
- transcript_raw = transcript_item.fetch()
256
-
257
- if transcript_raw is None:
258
  st.error("No transcript available.")
259
  st.stop()
260
 
261
- transcript_text = '\n'.join([i['text'].replace('\n',' ') for i in transcript_raw])
262
-
263
  ########################
264
  # Load Author Keywords, that are not viewable by users
265
  ########################
@@ -356,26 +439,7 @@ if st.button('Extract Sentences'):
356
  st.error('Please run extraction first.', icon="🚨")
357
  else:
358
 
359
- yt_img = f'http://img.youtube.com/vi/{video_id}/mqdefault.jpg'
360
- yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
361
- yt_img_html_link = '<a href='+url+'>'+yt_img_html+'</a>'
362
- video_info = {'ID': [video_id],
363
- 'Video':[yt_img_html_link],
364
- 'Author': [st.session_state["video_data"]["Author"][0]],
365
- 'Channel':[st.session_state["channel_id"]],
366
- 'Title': [st.session_state["video_data"]["Title"][0]],
367
- 'Published': [st.session_state["video_data"]["Published"][0]],
368
- 'Views':[st.session_state["video_data"]["Views"][0]],
369
- 'Length':[st.session_state["video_data"]["Length"][0]],
370
- 'Keywords':['; '.join(st.session_state["keywords"])]}
371
-
372
- transcript_info = {'Words':[int(st.session_state.extract[1])],
373
- 'Sentences': [int(st.session_state.extract[2])],
374
- 'Characters': [int(st.session_state.extract[3])],
375
- 'Tokens':[int(st.session_state.extract[4])],
376
- 'Lextext':[st.session_state.extract[0]],
377
- 'GPTSummary':[0]}
378
- df_current_ts = pd.DataFrame({**video_info,**transcript_info})
379
 
380
  # initial write.
381
  #df_new_sheet = pd.concat([df_current_ts])
@@ -473,64 +537,62 @@ st.write("")
473
 
474
  if st.button('Load Videos'):
475
 
 
 
 
 
476
  progress_text = 'Loading...'
477
  loading_bar = st.progress(0, text=progress_text)
478
  item_limit=3
479
- yt.get_channel_video_data(st.session_state["channel_id"],loading_bar, progress_text, item_limit)
 
480
 
481
- #with st.spinner('Loading...'):
482
- #yt.get_channel_video_data(st.session_state["channel_id"])
483
- #videos = scrapetube.get_channel(yt.channel_id, limit=3, sleep=2)
484
 
485
 
486
- vids_thumbnails = []
487
- vids_videoIds = []
488
- vids_titles = []
489
- vids_lengths = []
490
- vids_published= []
491
- vids_views= []
492
- item=0
493
- for video in yt.video_data:
494
- if item == item_limit:
495
- break
496
- item = item+1
497
 
498
- vids_video_id = video
499
- vids_url = 'https://www.youtube.com/watch?v='+vids_video_id
500
 
501
- yt_img = f'http://img.youtube.com/vi/{vids_video_id}/mqdefault.jpg'
502
- yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
503
- yt_img_html_link = '<a href='+vids_url+'>'+yt_img_html+'</a>'
504
- vids_thumbnails.append(yt_img_html_link)
505
-
506
- vids_video_id_link = '<a target="_self" href="/?vid='+vids_video_id+'">'+vids_video_id+'</a>'
507
- vids_videoIds.append(vids_video_id_link)
508
 
509
- vids_titles.append(yt.video_data[video]['title'])
 
 
 
 
 
 
 
 
510
 
511
- yt_length = yt.video_data[video]['duration']
512
- yt_length_isodate = isodate.parse_duration(yt_length)
513
- yt_length_isoformat = isodate.duration_isoformat(yt_length_isodate, "%H:%M:%S")[1:]
514
- vids_lengths.append(yt_length_isoformat)
515
-
516
- yt_publish_date = yt.video_data[video]['publishedAt']
517
- yt_publish_date_formatted = datetime.strptime(yt_publish_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%B %d, %Y')
518
- vids_published.append(yt_publish_date_formatted)
519
 
520
- yt_views = yt.video_data[video]['viewCount']
521
- yt_viws_formatted = format(int(yt_views), ",").replace(",", "'")
522
- vids_views.append(yt_viws_formatted)
 
 
523
 
524
- df_videos = {'Video': vids_thumbnails,
525
- 'Video ID':vids_videoIds,
526
- 'Title':vids_titles,
527
- 'Published':vids_published,
528
- 'Views':vids_views,
529
- 'Length':vids_lengths}
530
 
 
531
 
532
- dataset = pd.DataFrame(df_videos)
533
- st.markdown(dataset.style.hide(axis="index").to_html(), unsafe_allow_html=True)
534
 
535
 
536
 
 
156
  def get_extracted_text_to_dict(raw_text):
157
  st.session_state['extract'] = [raw_text,0,0,0,0]
158
 
159
def get_videos_from_yt(yt, item_limit=3):
    """Collect display-ready columns for the videos held in ``yt.video_data``.

    Args:
        yt: Object exposing ``video_data``, a mapping of video id to a dict
            that provides the keys ``title``, ``duration``, ``publishedAt``
            and ``viewCount`` (the keys read below).
        item_limit: Maximum number of videos to include. Defaults to 3, the
            value the call sites assign before calling this function. It is
            an explicit parameter so the function no longer depends on a
            module-level ``item_limit`` having been set by the caller.

    Returns:
        A dict of equally long lists keyed by 'Video', 'Video ID', 'Title',
        'Published', 'Views' and 'Length'; the callers pass it to
        ``pd.DataFrame``.
    """
    vids_thumbnails = []
    vids_videoIds = []
    vids_titles = []
    vids_lengths = []
    vids_published = []
    vids_views = []

    for item, vids_video_id in enumerate(yt.video_data):
        if item >= item_limit:
            break

        details = yt.video_data[vids_video_id]
        vids_url = 'https://www.youtube.com/watch?v=' + vids_video_id

        # Thumbnail image wrapped in a link to the video on YouTube.
        yt_img = f'http://img.youtube.com/vi/{vids_video_id}/mqdefault.jpg'
        yt_img_html = '<img src=' + yt_img + ' width="250" height="150" />'
        vids_thumbnails.append('<a href=' + vids_url + '>' + yt_img_html + '</a>')

        # The id links back into this app through the ``vid`` query parameter.
        vids_videoIds.append(
            '<a target="_self" href="/?vid=' + vids_video_id + '">'
            + vids_video_id + '</a>'
        )

        vids_titles.append(details['title'])

        # Re-render the duration as H:M:S; the slice drops the first
        # character of the formatted string.
        yt_length = isodate.parse_duration(details['duration'])
        vids_lengths.append(
            isodate.duration_isoformat(yt_length, "%H:%M:%S")[1:]
        )

        published = datetime.strptime(details['publishedAt'], '%Y-%m-%dT%H:%M:%SZ')
        vids_published.append(published.strftime('%B %d, %Y'))

        # Apostrophe as thousands separator, e.g. 1'234'567.
        vids_views.append(format(int(details['viewCount']), ",").replace(",", "'"))

    df_videos = {
        'Video': vids_thumbnails,
        'Video ID': vids_videoIds,
        'Title': vids_titles,
        'Published': vids_published,
        'Views': vids_views,
        'Length': vids_lengths,
    }

    return df_videos
207
+
208
def get_transcript(video_id):
    """Fetch the English transcript of a video as one newline-joined string.

    Args:
        video_id: Id of the video whose transcript is requested.

    Returns:
        A ``(transcript_text, is_generated)`` tuple. ``transcript_text`` is
        ``None`` when the fetch produced no data. A tuple is returned on
        every path because the callers unpack two values before checking
        the text for ``None``.
    """
    transcript_list = yta.list_transcripts(video_id)

    transcript_item = transcript_list.find_transcript(['en'])
    transcript_item_is_generated = transcript_item.is_generated
    transcript_raw = transcript_item.fetch()
    # NOTE(review): the transcript library may signal a missing transcript by
    # raising rather than returning None - confirm and handle at this point
    # if the "No transcript available" path should cover that case too.

    if transcript_raw is None:
        return None, transcript_item_is_generated

    # Flatten line breaks inside each snippet so every snippet is one line.
    transcript_text = '\n'.join(
        i['text'].replace('\n', ' ') for i in transcript_raw
    )

    return transcript_text, transcript_item_is_generated
223
+
224
def get_meta_info(video_id, url):
    """Build a one-row DataFrame describing the current video and its extract.

    Video metadata, channel id and keywords are read from
    ``st.session_state``; the text statistics come from
    ``st.session_state.extract`` (index 0 is stored as 'Lextext', indexes
    1-4 are converted to int for the count columns).

    Args:
        video_id: Id used for the 'ID' column and the thumbnail address.
        url: Link target wrapped around the thumbnail image.

    Returns:
        A ``pd.DataFrame`` with a single row.
    """
    state = st.session_state

    thumbnail = f'http://img.youtube.com/vi/{video_id}/mqdefault.jpg'
    thumbnail_tag = ''.join(['<img src=', thumbnail, ' width="250" height="150" />'])
    linked_thumbnail = ''.join(['<a href=', url, '>', thumbnail_tag, '</a>'])

    # Key order fixes the column order of the resulting frame.
    row = {
        'ID': [video_id],
        'Video': [linked_thumbnail],
        'Author': [state["video_data"]["Author"][0]],
        'Channel': [state["channel_id"]],
        'Title': [state["video_data"]["Title"][0]],
        'Published': [state["video_data"]["Published"][0]],
        'Views': [state["video_data"]["Views"][0]],
        'Length': [state["video_data"]["Length"][0]],
        'Keywords': ['; '.join(state["keywords"])],
    }

    row.update({
        'Words': [int(state.extract[1])],
        'Sentences': [int(state.extract[2])],
        'Characters': [int(state.extract[3])],
        'Tokens': [int(state.extract[4])],
        'Lextext': [state.extract[0]],
        'GPTSummary': [0],
    })

    return pd.DataFrame(row)
248
+
249
 
250
  #######################################################################################
251
  # Application Start
 
337
  # Load Transcript
338
  ###########################
339
 
340
+ transcript_text, transcript_item_is_generated = get_transcript(video_id)
341
 
342
+ if transcript_text is None:
 
 
 
 
 
343
  st.error("No transcript available.")
344
  st.stop()
345
 
 
 
346
  ########################
347
  # Load Author Keywords, that are not viewable by users
348
  ########################
 
439
  st.error('Please run extraction first.', icon="🚨")
440
  else:
441
 
442
+ df_current_ts = get_meta_info(video_id, url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
 
444
  # initial write.
445
  #df_new_sheet = pd.concat([df_current_ts])
 
537
 
538
  if st.button('Load Videos'):
539
 
540
+ if 'gsheed' not in st.session_state:
541
+ df = mysheet.read_gspread()
542
+ st.session_state.gsheed = df
543
+
544
  progress_text = 'Loading...'
545
  loading_bar = st.progress(0, text=progress_text)
546
  item_limit=3
547
+ df = st.session_state.gsheed
548
+ yt.get_channel_video_data(st.session_state["channel_id"],df, loading_bar, progress_text, item_limit)
549
 
550
+ df_videos = get_videos_from_yt(yt)
551
+ dataset = pd.DataFrame(df_videos)
552
+ st.markdown(dataset.style.hide(axis="index").to_html(), unsafe_allow_html=True)
553
 
554
 
555
+ ########################
556
+ # Sequence Loader
557
+ ########################
 
 
 
 
 
 
 
 
558
 
 
 
559
 
560
st.subheader("Sequence Loader")

# The hash gates the batch loader, so the input is masked while typing.
input_hash = st.text_input("Enter Hash:", type="password")
if st.button('Load Sequence'):
    import hmac  # only this section needs it

    HASH_KEY = st.secrets["hash_key"]
    # Compare as bytes in constant time: compare_digest rejects non-ASCII
    # str arguments and a plain == leaks timing information.
    access_granted = hmac.compare_digest(
        input_hash.encode("utf-8"), str(HASH_KEY).encode("utf-8")
    )

    if access_granted:
        st.write("Access granted")

        # Read the spreadsheet once and keep it in the session.
        if 'gsheed' not in st.session_state:
            df = mysheet.read_gspread()
            st.session_state.gsheed = df

        progress_text = 'Loading...'
        loading_bar = st.progress(0, text=progress_text)
        item_limit = 3
        df = st.session_state.gsheed
        yt.get_channel_video_data(st.session_state["channel_id"], df, loading_bar, progress_text, item_limit)
        df_videos = get_videos_from_yt(yt)
        dataset = pd.DataFrame(df_videos)

        for sng in dataset['Video ID']:
            # Each cell is an <a ...>ID</a> link; keep only the text between
            # the first '>' and the closing tag.
            subsng = sng[sng.find('>')+1:sng.find('</')]
            print(subsng)

            # A missing transcript is reported as None, either bare or in
            # the text slot; skip that video instead of failing the batch.
            transcript_result = get_transcript(subsng)
            if transcript_result is None or transcript_result[0] is None:
                st.warning(f"No transcript available for {subsng}.")
                continue
            transcript_text, transcript_item_is_generated = transcript_result

            if transcript_item_is_generated:
                get_punctuated_text(transcript_text)
            else:
                get_punctuated_text_to_dict(transcript_text)

            get_extracted_text(st.session_state.punkt[0])

    else:
        st.write("Access denied")
596
 
597
 
598
 
yt_stats.py CHANGED
@@ -33,7 +33,7 @@ class YTstats:
33
  #pbar.close()
34
  return data
35
 
36
- def get_channel_video_data(self, channel_id, loading_bar, progress_text, item_limit=3):
37
  "Extract all video information of the channel"
38
  print('get video data...')
39
  channel_videos, channel_playlists = self._get_channel_content(channel_id, limit=50)
@@ -61,7 +61,7 @@ class YTstats:
61
  duration = isodate.parse_duration(channel_videos[video_id]['duration'])
62
  short_duration = isodate.parse_duration('PT4M')
63
 
64
- if duration > short_duration:
65
  item = item+1
66
  step = step +step_size
67
  channel_videos_out[video_id] = channel_videos[video_id]
 
33
  #pbar.close()
34
  return data
35
 
36
+ def get_channel_video_data(self, channel_id, df_sheet, loading_bar, progress_text, item_limit=3):
37
  "Extract all video information of the channel"
38
  print('get video data...')
39
  channel_videos, channel_playlists = self._get_channel_content(channel_id, limit=50)
 
61
  duration = isodate.parse_duration(channel_videos[video_id]['duration'])
62
  short_duration = isodate.parse_duration('PT4M')
63
 
64
+ if duration > short_duration and video_id not in list(df_sheet.ID):
65
  item = item+1
66
  step = step +step_size
67
  channel_videos_out[video_id] = channel_videos[video_id]