load sequence
- app.py +135 -73
- yt_stats.py +2 -2
app.py CHANGED
@@ -156,6 +156,96 @@ def get_extracted_text(raw_text):
 def get_extracted_text_to_dict(raw_text):
     st.session_state['extract'] = [raw_text,0,0,0,0]
 
+def get_videos_from_yt(yt):
+
+    vids_thumbnails = []
+    vids_videoIds = []
+    vids_titles = []
+    vids_lengths = []
+    vids_published= []
+    vids_views= []
+    item=0
+    for video in yt.video_data:
+        if item == item_limit:
+            break
+        item = item+1
+
+        vids_video_id = video
+        vids_url = 'https://www.youtube.com/watch?v='+vids_video_id
+
+        yt_img = f'http://img.youtube.com/vi/{vids_video_id}/mqdefault.jpg'
+        yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
+        yt_img_html_link = '<a href='+vids_url+'>'+yt_img_html+'</a>'
+        vids_thumbnails.append(yt_img_html_link)
+
+        vids_video_id_link = '<a target="_self" href="/?vid='+vids_video_id+'">'+vids_video_id+'</a>'
+        vids_videoIds.append(vids_video_id_link)
+
+        vids_titles.append(yt.video_data[video]['title'])
+
+        yt_length = yt.video_data[video]['duration']
+        yt_length_isodate = isodate.parse_duration(yt_length)
+        yt_length_isoformat = isodate.duration_isoformat(yt_length_isodate, "%H:%M:%S")[1:]
+        vids_lengths.append(yt_length_isoformat)
+
+        yt_publish_date = yt.video_data[video]['publishedAt']
+        yt_publish_date_formatted = datetime.strptime(yt_publish_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%B %d, %Y')
+        vids_published.append(yt_publish_date_formatted)
+
+        yt_views = yt.video_data[video]['viewCount']
+        yt_viws_formatted = format(int(yt_views), ",").replace(",", "'")
+        vids_views.append(yt_viws_formatted)
+
+    df_videos = {'Video': vids_thumbnails,
+                 'Video ID':vids_videoIds,
+                 'Title':vids_titles,
+                 'Published':vids_published,
+                 'Views':vids_views,
+                 'Length':vids_lengths}
+
+    return df_videos
+
+def get_transcript(video_id):
+
+    transcript_list = yta.list_transcripts(video_id)
+
+    transcript_raw = None
+    transcript_item = transcript_list.find_transcript(['en'])
+    transcript_item_is_generated = transcript_item.is_generated
+    transcript_raw = transcript_item.fetch()
+
+    if transcript_raw is None:
+        return None
+
+    transcript_text = '\n'.join([i['text'].replace('\n',' ') for i in transcript_raw])
+
+    return transcript_text, transcript_item_is_generated
+
+def get_meta_info(video_id, url):
+
+    yt_img = f'http://img.youtube.com/vi/{video_id}/mqdefault.jpg'
+    yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
+    yt_img_html_link = '<a href='+url+'>'+yt_img_html+'</a>'
+    video_info = {'ID': [video_id],
+                  'Video':[yt_img_html_link],
+                  'Author': [st.session_state["video_data"]["Author"][0]],
+                  'Channel':[st.session_state["channel_id"]],
+                  'Title': [st.session_state["video_data"]["Title"][0]],
+                  'Published': [st.session_state["video_data"]["Published"][0]],
+                  'Views':[st.session_state["video_data"]["Views"][0]],
+                  'Length':[st.session_state["video_data"]["Length"][0]],
+                  'Keywords':['; '.join(st.session_state["keywords"])]}
+
+    transcript_info = {'Words':[int(st.session_state.extract[1])],
+                       'Sentences': [int(st.session_state.extract[2])],
+                       'Characters': [int(st.session_state.extract[3])],
+                       'Tokens':[int(st.session_state.extract[4])],
+                       'Lextext':[st.session_state.extract[0]],
+                       'GPTSummary':[0]}
+    df_current_ts = pd.DataFrame({**video_info,**transcript_info})
+
+    return df_current_ts
+
 
 #######################################################################################
 # Application Start
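A note on the hunk above: get_videos_from_yt reads item_limit from the enclosing script scope (it is only assigned inside the button handlers further down), so it works while everything lives in one Streamlit script but would raise NameError if the function were imported elsewhere. A minimal sketch of the same loop shape with the cap passed explicitly; the extra parameter is a suggested variant, not part of this commit:

    def get_videos_from_yt(yt, item_limit=3):  # hypothetical signature
        # same iteration pattern as the committed helper, cap made explicit
        item = 0
        for video in yt.video_data:
            if item == item_limit:
                break
            item = item + 1
            ...  # build thumbnails, titles, durations, views exactly as above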
@@ -247,19 +337,12 @@ st.write("")
 # Load Transcript
 ###########################
 
-
+transcript_text, transcript_item_is_generated = get_transcript(video_id)
 
-
-transcript_item = transcript_list.find_transcript(['en'])
-transcript_item_is_generated = transcript_item.is_generated
-transcript_raw = transcript_item.fetch()
-
-if transcript_raw is None:
+if transcript_text is None:
    st.error("No transcript available.")
    st.stop()
 
-transcript_text = '\n'.join([i['text'].replace('\n',' ') for i in transcript_raw])
-
 ########################
 # Load Author Keywords, that are not viewable by users
 ########################
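The call above depends on get_transcript's return shape: when transcript_raw is None, the committed helper returns a bare None, so the tuple unpacking in transcript_text, transcript_item_is_generated = get_transcript(video_id) would raise TypeError before the st.error guard ever runs (if fetch() ever returns None rather than raising). A sketch of a safer contract that keeps the two-value shape on every path; this is a suggested variant, not the committed code:

    def get_transcript(video_id):
        transcript_list = yta.list_transcripts(video_id)
        transcript_item = transcript_list.find_transcript(['en'])
        transcript_raw = transcript_item.fetch()
        if transcript_raw is None:
            # return a 2-tuple so the caller's unpacking never breaks
            return None, False
        transcript_text = '\n'.join(i['text'].replace('\n', ' ') for i in transcript_raw)
        return transcript_text, transcript_item.is_generated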
@@ -356,26 +439,7 @@ if st.button('Extract Sentences'):
        st.error('Please run extraction first.', icon="🚨")
    else:
 
-
-       yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
-       yt_img_html_link = '<a href='+url+'>'+yt_img_html+'</a>'
-       video_info = {'ID': [video_id],
-                     'Video':[yt_img_html_link],
-                     'Author': [st.session_state["video_data"]["Author"][0]],
-                     'Channel':[st.session_state["channel_id"]],
-                     'Title': [st.session_state["video_data"]["Title"][0]],
-                     'Published': [st.session_state["video_data"]["Published"][0]],
-                     'Views':[st.session_state["video_data"]["Views"][0]],
-                     'Length':[st.session_state["video_data"]["Length"][0]],
-                     'Keywords':['; '.join(st.session_state["keywords"])]}
-
-       transcript_info = {'Words':[int(st.session_state.extract[1])],
-                          'Sentences': [int(st.session_state.extract[2])],
-                          'Characters': [int(st.session_state.extract[3])],
-                          'Tokens':[int(st.session_state.extract[4])],
-                          'Lextext':[st.session_state.extract[0]],
-                          'GPTSummary':[0]}
-       df_current_ts = pd.DataFrame({**video_info,**transcript_info})
+       df_current_ts = get_meta_info(video_id, url)
 
        # initial write.
        #df_new_sheet = pd.concat([df_current_ts])
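get_meta_info builds two dicts whose values are single-element lists and merges them into a one-row DataFrame. A small illustration of that pattern with made-up values:

    import pandas as pd

    video_info = {'ID': ['abc123'], 'Title': ['Example video']}
    transcript_info = {'Words': [950], 'Sentences': [41]}

    # dict unpacking concatenates the columns; the single-element lists
    # make this a one-row frame, ready to append to the sheet-backed table
    df_current_ts = pd.DataFrame({**video_info, **transcript_info})
    print(df_current_ts.shape)  # (1, 4)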
@@ -473,64 +537,62 @@ st.write("")
 
 if st.button('Load Videos'):
 
+    if 'gsheed' not in st.session_state:
+        df = mysheet.read_gspread()
+        st.session_state.gsheed = df
+
     progress_text = 'Loading...'
     loading_bar = st.progress(0, text=progress_text)
     item_limit=3
-
+    df = st.session_state.gsheed
+    yt.get_channel_video_data(st.session_state["channel_id"],df, loading_bar, progress_text, item_limit)
 
-
-
-
+    df_videos = get_videos_from_yt(yt)
+    dataset = pd.DataFrame(df_videos)
+    st.markdown(dataset.style.hide(axis="index").to_html(), unsafe_allow_html=True)
 
 
-    vids_thumbnails = []
-    vids_videoIds = []
-    vids_titles = []
-    vids_lengths = []
-    vids_published= []
-    vids_views= []
-    item=0
-    for video in yt.video_data:
-        if item == item_limit:
-            break
-        item = item+1
+########################
+# Sequence Loader
+########################
 
-        vids_video_id = video
-        vids_url = 'https://www.youtube.com/watch?v='+vids_video_id
+st.subheader("Sequence Loader")
+# input hash as secret
 
-        yt_img = f'http://img.youtube.com/vi/{vids_video_id}/mqdefault.jpg'
-        yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
-        yt_img_html_link = '<a href='+vids_url+'>'+yt_img_html+'</a>'
-        vids_thumbnails.append(yt_img_html_link)
-
-        vids_video_id_link = '<a target="_self" href="/?vid='+vids_video_id+'">'+vids_video_id+'</a>'
-        vids_videoIds.append(vids_video_id_link)
 
-        vids_titles.append(yt.video_data[video]['title'])
+input_hash = st.text_input("Enter Hash:")
+if st.button('Load Sequence'):
+    HASH_KEY = st.secrets["hash_key"]
+    if input_hash == HASH_KEY:
+        st.write("Access granted")
+        # read in spreadsheet
+        if 'gsheed' not in st.session_state:
+            df = mysheet.read_gspread()
+            st.session_state.gsheed = df
 
-        yt_length = yt.video_data[video]['duration']
-        yt_length_isodate = isodate.parse_duration(yt_length)
-        yt_length_isoformat = isodate.duration_isoformat(yt_length_isodate, "%H:%M:%S")[1:]
-        vids_lengths.append(yt_length_isoformat)
-
-        yt_publish_date = yt.video_data[video]['publishedAt']
-        yt_publish_date_formatted = datetime.strptime(yt_publish_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%B %d, %Y')
-        vids_published.append(yt_publish_date_formatted)
+        progress_text = 'Loading...'
+        loading_bar = st.progress(0, text=progress_text)
+        item_limit=3
+        df = st.session_state.gsheed
+        yt.get_channel_video_data(st.session_state["channel_id"], df,loading_bar, progress_text, item_limit)
+        df_videos = get_videos_from_yt(yt)
+        dataset = pd.DataFrame(df_videos)
 
-        yt_views = yt.video_data[video]['viewCount']
-        yt_viws_formatted = format(int(yt_views), ",").replace(",", "'")
-        vids_views.append(yt_viws_formatted)
+        for sng in dataset['Video ID']:
+            subsng = sng[sng.find('>')+1:sng.find('</')]
+            print(subsng)
+
+            transcript_text, transcript_item_is_generated = get_transcript(subsng)
 
-    df_videos = {'Video': vids_thumbnails,
-                 'Video ID':vids_videoIds,
-                 'Title':vids_titles,
-                 'Published':vids_published,
-                 'Views':vids_views,
-                 'Length':vids_lengths}
+            if transcript_item_is_generated:
+                get_punctuated_text(transcript_text)
+            else:
+                get_punctuated_text_to_dict(transcript_text)
 
+            get_extracted_text(st.session_state.punkt[0])
 
-
-
+    else:
+        st.write("Access denied")
 
 
 
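One detail in the Sequence Loader gate above: input_hash == HASH_KEY compares the shared secret with ordinary string equality, which is not constant-time. The standard library's hmac.compare_digest is a drop-in, timing-safe alternative; this is a suggested hardening, not something the commit does:

    import hmac
    import streamlit as st

    input_hash = st.text_input("Enter Hash:")
    if st.button('Load Sequence'):
        HASH_KEY = st.secrets["hash_key"]
        if hmac.compare_digest(input_hash, HASH_KEY):  # constant-time comparison
            st.write("Access granted")
        else:
            st.write("Access denied")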
yt_stats.py CHANGED
@@ -33,7 +33,7 @@ class YTstats:
         #pbar.close()
         return data
 
-    def get_channel_video_data(self, channel_id, loading_bar, progress_text, item_limit=3):
+    def get_channel_video_data(self, channel_id, df_sheet, loading_bar, progress_text, item_limit=3):
         "Extract all video information of the channel"
         print('get video data...')
         channel_videos, channel_playlists = self._get_channel_content(channel_id, limit=50)
@@ -61,7 +61,7 @@ class YTstats:
             duration = isodate.parse_duration(channel_videos[video_id]['duration'])
             short_duration = isodate.parse_duration('PT4M')
 
-            if duration > short_duration:
+            if duration > short_duration and video_id not in list(df_sheet.ID):
                 item = item+1
                 step = step +step_size
                 channel_videos_out[video_id] = channel_videos[video_id]
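The new duplicate filter evaluates list(df_sheet.ID) for every candidate video, rebuilding the list and scanning it linearly each time. Hoisting the IDs into a set once per call keeps the same behaviour with O(1) membership tests; a sketch of that variant, assuming df_sheet carries the same ID column as in the diff:

    known_ids = set(df_sheet.ID)  # build the membership set once per call
    for video_id in channel_videos:
        duration = isodate.parse_duration(channel_videos[video_id]['duration'])
        short_duration = isodate.parse_duration('PT4M')
        if duration > short_duration and video_id not in known_ids:
            channel_videos_out[video_id] = channel_videos[video_id]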