Spaces:

wldmr
/

transcriptifier-st-hf7

Runtime error

App Files Files Community

wldmr committed on Jun 15, 2023

Commit

008c815

1 Parent(s): e817fae

load sequence

Browse files

Files changed (2) hide show

app.py +135 -73
yt_stats.py +2 -2

app.py CHANGED Viewed

@@ -156,6 +156,96 @@ def get_extracted_text(raw_text):
 def get_extracted_text_to_dict(raw_text):
     st.session_state['extract'] = [raw_text,0,0,0,0]
 #######################################################################################
 # Application Start
@@ -247,19 +337,12 @@ st.write("")
 # Load Transcript
 ###########################
-transcript_list = yta.list_transcripts(video_id)
-transcript_raw = None
-transcript_item = transcript_list.find_transcript(['en'])
-transcript_item_is_generated = transcript_item.is_generated
-transcript_raw = transcript_item.fetch()
-if transcript_raw is None:
     st.error("No transcript available.")
     st.stop()
-transcript_text = '\n'.join([i['text'].replace('\n',' ') for i in transcript_raw])
 ########################
 # Load Author Keywords, that are not viewable by users
 ########################
@@ -356,26 +439,7 @@ if st.button('Extract Sentences'):
         st.error('Please run extraction first.', icon="🚨")
     else:
-        yt_img = f'http://img.youtube.com/vi/{video_id}/mqdefault.jpg'
-        yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
-        yt_img_html_link = '<a href='+url+'>'+yt_img_html+'</a>'
-        video_info = {'ID': [video_id],
-                'Video':[yt_img_html_link],
-                'Author': [st.session_state["video_data"]["Author"][0]],
-                'Channel':[st.session_state["channel_id"]],
-                'Title': [st.session_state["video_data"]["Title"][0]],
-                'Published': [st.session_state["video_data"]["Published"][0]],
-                'Views':[st.session_state["video_data"]["Views"][0]],
-                'Length':[st.session_state["video_data"]["Length"][0]],
-                'Keywords':['; '.join(st.session_state["keywords"])]}
-        transcript_info = {'Words':[int(st.session_state.extract[1])],
-                'Sentences': [int(st.session_state.extract[2])],
-                'Characters': [int(st.session_state.extract[3])],
-                'Tokens':[int(st.session_state.extract[4])],
-                'Lextext':[st.session_state.extract[0]],
-                'GPTSummary':[0]}
-        df_current_ts = pd.DataFrame({**video_info,**transcript_info})
         # initial write.
         #df_new_sheet = pd.concat([df_current_ts])
@@ -473,64 +537,62 @@ st.write("")
 if st.button('Load Videos'):
     progress_text = 'Loading...'
     loading_bar = st.progress(0, text=progress_text)
     item_limit=3
-    yt.get_channel_video_data(st.session_state["channel_id"],loading_bar, progress_text, item_limit)
-    #with st.spinner('Loading...'):
-        #yt.get_channel_video_data(st.session_state["channel_id"])
-        #videos = scrapetube.get_channel(yt.channel_id, limit=3, sleep=2)
-    vids_thumbnails = []
-    vids_videoIds = []
-    vids_titles = []
-    vids_lengths = []
-    vids_published= []
-    vids_views= []
-    item=0
-    for video in yt.video_data:
-        if item == item_limit:
-            break
-        item = item+1
-        vids_video_id = video
-        vids_url = 'https://www.youtube.com/watch?v='+vids_video_id
-        yt_img = f'http://img.youtube.com/vi/{vids_video_id}/mqdefault.jpg'
-        yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
-        yt_img_html_link = '<a href='+vids_url+'>'+yt_img_html+'</a>'
-        vids_thumbnails.append(yt_img_html_link)
-        vids_video_id_link = '<a target="_self" href="/?vid='+vids_video_id+'">'+vids_video_id+'</a>'
-        vids_videoIds.append(vids_video_id_link)
-        vids_titles.append(yt.video_data[video]['title'])
-        yt_length = yt.video_data[video]['duration']
-        yt_length_isodate = isodate.parse_duration(yt_length)
-        yt_length_isoformat = isodate.duration_isoformat(yt_length_isodate, "%H:%M:%S")[1:]
-        vids_lengths.append(yt_length_isoformat)
-        yt_publish_date = yt.video_data[video]['publishedAt']
-        yt_publish_date_formatted = datetime.strptime(yt_publish_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%B %d, %Y')
-        vids_published.append(yt_publish_date_formatted)
-        yt_views = yt.video_data[video]['viewCount']
-        yt_viws_formatted = format(int(yt_views), ",").replace(",", "'")
-        vids_views.append(yt_viws_formatted)
-    df_videos = {'Video': vids_thumbnails,
-                'Video ID':vids_videoIds,
-                'Title':vids_titles,
-                'Published':vids_published,
-                'Views':vids_views,
-                'Length':vids_lengths}
-    dataset = pd.DataFrame(df_videos)
-    st.markdown(dataset.style.hide(axis="index").to_html(), unsafe_allow_html=True)

 def get_extracted_text_to_dict(raw_text):
     st.session_state['extract'] = [raw_text,0,0,0,0]
+def get_videos_from_yt(yt):
+    vids_thumbnails = []
+    vids_videoIds = []
+    vids_titles = []
+    vids_lengths = []
+    vids_published= []
+    vids_views= []
+    item=0
+    for video in yt.video_data:
+        if item == item_limit:
+            break
+        item = item+1
+        vids_video_id = video
+        vids_url = 'https://www.youtube.com/watch?v='+vids_video_id
+        yt_img = f'http://img.youtube.com/vi/{vids_video_id}/mqdefault.jpg'
+        yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
+        yt_img_html_link = '<a href='+vids_url+'>'+yt_img_html+'</a>'
+        vids_thumbnails.append(yt_img_html_link)
+        vids_video_id_link = '<a target="_self" href="/?vid='+vids_video_id+'">'+vids_video_id+'</a>'
+        vids_videoIds.append(vids_video_id_link)
+        vids_titles.append(yt.video_data[video]['title'])
+        yt_length = yt.video_data[video]['duration']
+        yt_length_isodate = isodate.parse_duration(yt_length)
+        yt_length_isoformat = isodate.duration_isoformat(yt_length_isodate, "%H:%M:%S")[1:]
+        vids_lengths.append(yt_length_isoformat)
+        yt_publish_date = yt.video_data[video]['publishedAt']
+        yt_publish_date_formatted = datetime.strptime(yt_publish_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%B %d, %Y')
+        vids_published.append(yt_publish_date_formatted)
+        yt_views = yt.video_data[video]['viewCount']
+        yt_viws_formatted = format(int(yt_views), ",").replace(",", "'")
+        vids_views.append(yt_viws_formatted)
+    df_videos = {'Video': vids_thumbnails,
+                'Video ID':vids_videoIds,
+                'Title':vids_titles,
+                'Published':vids_published,
+                'Views':vids_views,
+                'Length':vids_lengths}
+    return df_videos
+def get_transcript(video_id):
+    transcript_list = yta.list_transcripts(video_id)
+    transcript_raw = None
+    transcript_item = transcript_list.find_transcript(['en'])
+    transcript_item_is_generated = transcript_item.is_generated
+    transcript_raw = transcript_item.fetch()
+    if transcript_raw is None:
+        return None
+    transcript_text = '\n'.join([i['text'].replace('\n',' ') for i in transcript_raw])
+    return transcript_text, transcript_item_is_generated
+def get_meta_info(video_id, url):
+    yt_img = f'http://img.youtube.com/vi/{video_id}/mqdefault.jpg'
+    yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
+    yt_img_html_link = '<a href='+url+'>'+yt_img_html+'</a>'
+    video_info = {'ID': [video_id],
+            'Video':[yt_img_html_link],
+            'Author': [st.session_state["video_data"]["Author"][0]],
+            'Channel':[st.session_state["channel_id"]],
+            'Title': [st.session_state["video_data"]["Title"][0]],
+            'Published': [st.session_state["video_data"]["Published"][0]],
+            'Views':[st.session_state["video_data"]["Views"][0]],
+            'Length':[st.session_state["video_data"]["Length"][0]],
+            'Keywords':['; '.join(st.session_state["keywords"])]}
+    transcript_info = {'Words':[int(st.session_state.extract[1])],
+            'Sentences': [int(st.session_state.extract[2])],
+            'Characters': [int(st.session_state.extract[3])],
+            'Tokens':[int(st.session_state.extract[4])],
+            'Lextext':[st.session_state.extract[0]],
+            'GPTSummary':[0]}
+    df_current_ts = pd.DataFrame({**video_info,**transcript_info})
+    return df_current_ts
 #######################################################################################
 # Application Start
 # Load Transcript
 ###########################
+transcript_text, transcript_item_is_generated = get_transcript(video_id)
+if transcript_text is None:
     st.error("No transcript available.")
     st.stop()
 ########################
 # Load Author Keywords, that are not viewable by users
 ########################
         st.error('Please run extraction first.', icon="🚨")
     else:
+        df_current_ts = get_meta_info(video_id, url)
         # initial write.
         #df_new_sheet = pd.concat([df_current_ts])
 if st.button('Load Videos'):
+    if 'gsheed' not in st.session_state:
+        df = mysheet.read_gspread()
+        st.session_state.gsheed = df
     progress_text = 'Loading...'
     loading_bar = st.progress(0, text=progress_text)
     item_limit=3
+    df = st.session_state.gsheed
+    yt.get_channel_video_data(st.session_state["channel_id"],df, loading_bar, progress_text, item_limit)
+    df_videos = get_videos_from_yt(yt)
+    dataset = pd.DataFrame(df_videos)
+    st.markdown(dataset.style.hide(axis="index").to_html(), unsafe_allow_html=True)
+########################
+# Sequence Loader
+########################
+st.subheader("Sequence Loader")
+# input hash as secret
+input_hash = st.text_input("Enter Hash:")
+if st.button('Load Sequence'):
+    HASH_KEY = st.secrets["hash_key"]
+    if input_hash == HASH_KEY:
+        st.write("Access granted")
+        # read in spreadsheet
+        if 'gsheed' not in st.session_state:
+            df = mysheet.read_gspread()
+            st.session_state.gsheed = df
+        progress_text = 'Loading...'
+        loading_bar = st.progress(0, text=progress_text)
+        item_limit=3
+        df = st.session_state.gsheed
+        yt.get_channel_video_data(st.session_state["channel_id"], df,loading_bar, progress_text, item_limit)
+        df_videos = get_videos_from_yt(yt)
+        dataset = pd.DataFrame(df_videos)
+        for sng in dataset['Video ID']:
+            subsng = sng[sng.find('>')+1:sng.find('</')]
+            print(subsng)
+            transcript_text, transcript_item_is_generated = get_transcript(subsng)
+            if transcript_item_is_generated:
+                get_punctuated_text(transcript_text)
+            else:
+                get_punctuated_text_to_dict(transcript_text)
+            get_extracted_text(st.session_state.punkt[0])
+    else:
+        st.write("Access denied")

yt_stats.py CHANGED Viewed

@@ -33,7 +33,7 @@ class YTstats:
         #pbar.close()
         return data
-    def get_channel_video_data(self, channel_id, loading_bar, progress_text, item_limit=3):
         "Extract all video information of the channel"
         print('get video data...')
         channel_videos, channel_playlists = self._get_channel_content(channel_id, limit=50)
@@ -61,7 +61,7 @@ class YTstats:
             duration = isodate.parse_duration(channel_videos[video_id]['duration'])
             short_duration = isodate.parse_duration('PT4M')
-            if duration > short_duration:
                 item = item+1
                 step = step +step_size
                 channel_videos_out[video_id] = channel_videos[video_id]

         #pbar.close()
         return data
+    def get_channel_video_data(self, channel_id, df_sheet, loading_bar, progress_text, item_limit=3):
         "Extract all video information of the channel"
         print('get video data...')
         channel_videos, channel_playlists = self._get_channel_content(channel_id, limit=50)
             duration = isodate.parse_duration(channel_videos[video_id]['duration'])
             short_duration = isodate.parse_duration('PT4M')
+            if duration > short_duration and video_id not in list(df_sheet.ID):
                 item = item+1
                 step = step +step_size
                 channel_videos_out[video_id] = channel_videos[video_id]