Spaces:

michaelthwan
/

digest-everything-gpt

Sleeping

App Files Files Community

michaelthwan committed on Jun 24, 2023

Commit

a4aef81

1 Parent(s): 347bbc4

20230624

Browse files

Files changed (6) hide show

README.md +4 -1
digester/chatgpt_service.py +3 -54
digester/gradio_method_service.py +26 -3
digester/gradio_ui_service.py +1 -1
digester/test_youtube_chain.py +10 -3
img/n_things_example.png +0 -0

README.md CHANGED Viewed

@@ -12,6 +12,8 @@ license: mit
 # DigestEverythingGPT
 DigestEverythingGPT provides a world-class content summarization/query tool that leverages ChatGPT/LLMs to help users
 quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF
 documents.
@@ -19,6 +21,8 @@ documents.
 The prompt engineering is **chained and tuned** so that its result is of high quality and fast. It is not a simple single
 query and response tool.
 # Live website
 https://huggingface.co/spaces/michaelthwan/digest-everything-gpt
@@ -79,4 +83,3 @@ DigestEverything-GPT is licensed under the MIT License.
 # Acknowledgements
 - chatgpt_academic for gradio code framework

 # DigestEverythingGPT
 DigestEverythingGPT provides a world-class content summarization/query tool that leverages ChatGPT/LLMs to help users
 quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF
 documents.
 The prompt engineering is **chained and tuned** so that its result is of high quality and fast. It is not a simple single
 query and response tool.
+Please leave me a star🌟 if you like this project!
 # Live website
 https://huggingface.co/spaces/michaelthwan/digest-everything-gpt
 # Acknowledgements
 - chatgpt_academic for gradio code framework

digester/chatgpt_service.py CHANGED Viewed

@@ -219,65 +219,13 @@ class ChatGPTService:
     @staticmethod
     def single_call_chatgpt_with_handling(source_md, prompt_str: str, prompt_show_user: str, chatbot, api_key, gpt_model, history=[]):
-        """
-        Handling
-        - token exceeding -> split input
-        - timeout -> retry 2 times
-        - other error -> retry 2 times
-        """
-        TIMEOUT_SECONDS, MAX_RETRY = config['openai']['timeout_sec'], config['openai']['max_retry']
-        # When multi-threaded, you need a mutable structure to pass information between different threads
-        # list is the simplest mutable structure, we put gpt output in the first position, the second position to pass the error message
-        mutable_list = [None, '']  # [gpt_output, error_message]
-        # multi-threading worker
-        def mt(prompt_str, history):
-            while True:
-                try:
-                    mutable_list[0] = ChatGPTService.single_rest_call_chatgpt(api_key, prompt_str, gpt_model, history=history)
-                    break
-                except ConnectionAbortedError as token_exceeded_error:
-                    # # Try to calculate the ratio and keep as much text as possible
-                    # print(f'[Local Message] Token exceeded: {token_exceeded_error}.')
-                    # p_ratio, n_exceed = ChatGPTService.get_reduce_token_percent(str(token_exceeded_error))
-                    # if len(history) > 0:
-                    #     history = [his[int(len(his) * p_ratio):] for his in history if his is not None]
-                    # else:
-                    #     prompt_str = prompt_str[:int(len(prompt_str) * p_ratio)]
-                    # mutable_list[1] = f'Warning: text too long will be truncated. Token exceeded：{n_exceed}，Truncation ratio: {(1 - p_ratio):.0%}。'
-                    mutable_list[0] = TOKEN_EXCEED_MSG
-                except TimeoutError as e:
-                    mutable_list[0] = TIMEOUT_MSG
-                    raise TimeoutError
-                except Exception as e:
-                    mutable_list[0] = f'{provide_text_with_css("ERROR", "red")} Exception: {str(e)}.'
-                    raise RuntimeError(f'[ERROR] Exception: {str(e)}.')
-                # TODO retry
-        # Create a new thread to make http requests
-        thread_name = threading.Thread(target=mt, args=(prompt_str, history))
-        thread_name.start()
-        # The original thread is responsible for continuously updating the UI, implementing a timeout countdown, and waiting for the new thread's task to complete
-        cnt = 0
-        while thread_name.is_alive():
-            cnt += 1
-            is_append = False
-            if cnt == 1:
-                is_append = True
-            yield from ChatGPTService.say(prompt_show_user, f"""
-{provide_text_with_css("PROCESSING...", "blue")} {mutable_list[1]}waiting gpt response {cnt}/{TIMEOUT_SECONDS * 2 * (MAX_RETRY + 1)}{''.join(['.'] * (cnt % 4))}
-{mutable_list[0]}
-            """, chatbot, history, 'Normal', source_md, is_append)
-            time.sleep(1)
-        # Get the output of gpt out of the mutable
-        gpt_response = mutable_list[0]
         if 'ERROR' in gpt_response:
             raise Exception
         return gpt_response
     @staticmethod
-    def single_rest_call_chatgpt(api_key, prompt_str: str, gpt_model, history=[], observe_window=None):
         """
         Single call chatgpt only. No handling on multiple call (it should be in upper caller multi_call_chatgpt_with_handling())
         - Support stream=True
@@ -324,6 +272,7 @@ class ChatGPTService:
             if "content" in delta:
                 result += delta["content"]
                 print(delta["content"], end='')
                 if observe_window is not None: observe_window[0] += delta["content"]
             else:
                 raise RuntimeError("Unexpected Json structure: " + delta)

     @staticmethod
     def single_call_chatgpt_with_handling(source_md, prompt_str: str, prompt_show_user: str, chatbot, api_key, gpt_model, history=[]):
+        gpt_response = yield from ChatGPTService.single_rest_call_chatgpt(api_key, prompt_str, gpt_model, chatbot, history=history)
         if 'ERROR' in gpt_response:
             raise Exception
         return gpt_response
     @staticmethod
+    def single_rest_call_chatgpt(api_key, prompt_str: str, gpt_model, chatbot, history=[], observe_window=None):
         """
         Single call chatgpt only. No handling on multiple call (it should be in upper caller multi_call_chatgpt_with_handling())
         - Support stream=True
             if "content" in delta:
                 result += delta["content"]
                 print(delta["content"], end='')
+                yield from ChatGPTService.say(None, result, chatbot, history, "Success", "", is_append=False)
                 if observe_window is not None: observe_window[0] += delta["content"]
             else:
                 raise RuntimeError("Unexpected Json structure: " + delta)

digester/gradio_method_service.py CHANGED Viewed

@@ -245,9 +245,12 @@ Give the video type with JSON format like {"type": "N things"}, and exclude othe
         prompt_suffix="""
 [TASK]
 Convert this into youtube summary.
-Separate for 2-5 minutes chunk, maximum 6 words as a noun for one line.
 Start with the timestamp followed by the summarized text for that chunk.
 Must use language: {language}
 Example format:
 {first_timestamp} - This is the first part
@@ -270,6 +273,7 @@ Example format:
 Summarize the above points under 30 words. Step by step showing points for the main concepts.
 Use markdown format.
 Must use language: {language}
 {task_constraint}
 The format is like:
@@ -302,7 +306,8 @@ Highlights:
 - [Emoji] (content of highlights)
 - [Emoji] (content of highlights)
-For highlight, up to five concise bullet points, less than 15 words for each bullet point. Put different appropriate emoji for each bullet point
 """,
     }
@@ -353,9 +358,18 @@ For highlight, up to five concise bullet points, less than 15 words for each bul
         transcript_with_ts = ""
         for entry in youtube_data.ts_transcript_list:
             transcript_with_ts += f"{int(entry['start'] // 60)}:{int(entry['start'] % 60):02d} {entry['text']}\n"
         prompt = Prompt(cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
                         cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_main.format(transcript_with_ts=transcript_with_ts),
                         cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_suffix.replace("{language}", g_inputs.language_textbox)
                         )
         prompt_show_user = "Generate the timestamped summary"
         response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs, is_timestamp=True)
@@ -368,10 +382,19 @@ For highlight, up to five concise bullet points, less than 15 words for each bul
             task_constraint = cls.FINAL_SUMMARY_TASK_CONSTRAINTS[video_type]
         else:
             task_constraint = ""
         prompt = Prompt(
             cls.FINAL_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
             cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=youtube_data.full_content),
-            cls.FINAL_SUMMARY_PROMPT.prompt_suffix.format(task_constraint=task_constraint, format_constraint=format_constraint, language=g_inputs.language_textbox)
         )
         prompt_show_user = "Generate the final summary"
         response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)

         prompt_suffix="""
 [TASK]
 Convert this into youtube summary.
+Combine and merge timestamp to for 2-5 minutes chunk. Maximum {word_limit} using noun for one line. Must not exceed the limit
 Start with the timestamp followed by the summarized text for that chunk.
 Must use language: {language}
+Strictly follow the task rules especially for language and character limit
+Maximum {word_limit} using noun for one line. Using noun, not sentence
 Example format:
 {first_timestamp} - This is the first part
 Summarize the above points under 30 words. Step by step showing points for the main concepts.
 Use markdown format.
 Must use language: {language}
+Strictly follow the task rules and use {language} language
 {task_constraint}
 The format is like:
 - [Emoji] (content of highlights)
 - [Emoji] (content of highlights)
+For highlight, up to five concise bullet points, less than {char_limit} for each bullet point. Put different appropriate emoji for each bullet point
+Must use language {language} as output
 """,
     }
         transcript_with_ts = ""
         for entry in youtube_data.ts_transcript_list:
             transcript_with_ts += f"{int(entry['start'] // 60)}:{int(entry['start'] % 60):02d} {entry['text']}\n"
+        def _get_char_limit(language: str):
+            """If Chinese/Japan/Korean, use character limit. Otherwise, use word limit"""
+            if 'zh' in language or language in ["ja-JP", "ko-KR"]:
+                return f"15 {language} characters"
+            else:
+                return "8 words"
         prompt = Prompt(cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
                         cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_main.format(transcript_with_ts=transcript_with_ts),
                         cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_suffix.replace("{language}", g_inputs.language_textbox)
+                        .replace("{word_limit}", _get_char_limit(g_inputs.language_textbox))
                         )
         prompt_show_user = "Generate the timestamped summary"
         response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs, is_timestamp=True)
             task_constraint = cls.FINAL_SUMMARY_TASK_CONSTRAINTS[video_type]
         else:
             task_constraint = ""
+        def _get_char_limit(language):
+            """If Chinese/Japan/Korean, use character limit. Otherwise, use word limit"""
+            if 'zh' in language or language in ["ja-JP", "ko-KR"]:
+                return f"30 {language} characters"
+            else:
+                return "15 words"
         prompt = Prompt(
             cls.FINAL_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
             cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=youtube_data.full_content),
+            cls.FINAL_SUMMARY_PROMPT.prompt_suffix.format(task_constraint=task_constraint,
+                                                          format_constraint=format_constraint.replace("{char_limit}", _get_char_limit(g_inputs.language_textbox)).replace("{language}", g_inputs.language_textbox),
+                                                          language=g_inputs.language_textbox)
         )
         prompt_show_user = "Generate the final summary"
         response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)

digester/gradio_ui_service.py CHANGED Viewed

@@ -8,7 +8,7 @@ title_html = """
 <p align=\"center\">
 DigestEverythingGPT leverages ChatGPT/LLMs to help users quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF documents.<br>
 The prompt engineering is chained and tuned so that is result is of high quality and fast. It is not a simple single query and response tool.<br>
-Version 20230619 (
 <a href="https://github.com/michaelthwan/digest-everything-gpt"><i class="fa fa-github"></i> Github</a>
 ) (
 <a href="https://huggingface.co/spaces/michaelthwan/digest-everything-gpt"> HFSpace</a>

 <p align=\"center\">
 DigestEverythingGPT leverages ChatGPT/LLMs to help users quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF documents.<br>
 The prompt engineering is chained and tuned so that is result is of high quality and fast. It is not a simple single query and response tool.<br>
+Version 20230624 (
 <a href="https://github.com/michaelthwan/digest-everything-gpt"><i class="fa fa-github"></i> Github</a>
 ) (
 <a href="https://huggingface.co/spaces/michaelthwan/digest-everything-gpt"> HFSpace</a>

digester/test_youtube_chain.py CHANGED Viewed

@@ -49,6 +49,11 @@ class VideoExample:
         video_id = "lF_KWLfQFs8"
         return VideoExample.get_youtube_data("", video_id)
 class YoutubeTestChain:
     def __init__(self, api_key: str, gpt_model):
@@ -87,9 +92,11 @@ if __name__ == '__main__':
     GPT_MODEL = "gpt-3.5-turbo-16k"
     assert api_key
-    gradio_inputs = GradioInputs(apikey_textbox=api_key, gpt_model_textbox=GPT_MODEL, source_textbox="", source_target_textbox="", qa_textbox="", language_textbox="en-US", chatbot=[], history=[])
-    youtube_data: YoutubeData = VideoExample.get_nthings_8_habits()
     youtube_test_chain = YoutubeTestChain(api_key, GPT_MODEL)
     # youtube_test_chain.test_youtube_classifier(gradio_inputs, youtube_data)

         video_id = "lF_KWLfQFs8"
         return VideoExample.get_youtube_data("", video_id)
+    @staticmethod
+    def get_wealth_CN_long_vid():
+        video_id = "6mVX78_nq0A"
+        return VideoExample.get_youtube_data("", video_id)
 class YoutubeTestChain:
     def __init__(self, api_key: str, gpt_model):
     GPT_MODEL = "gpt-3.5-turbo-16k"
     assert api_key
+    language = "zh-CN"
+    gradio_inputs = GradioInputs(apikey_textbox=api_key, gpt_model_textbox=GPT_MODEL,
+                                 source_textbox="", source_target_textbox="",
+                                 qa_textbox="", language_textbox=language, chatbot=[], history=[])
+    youtube_data: YoutubeData = VideoExample.get_nthings_10_autogpt()
     youtube_test_chain = YoutubeTestChain(api_key, GPT_MODEL)
     # youtube_test_chain.test_youtube_classifier(gradio_inputs, youtube_data)

img/n_things_example.png CHANGED Viewed