Spaces:
Sleeping
Sleeping
michaelthwan
committed on
Commit
·
a4aef81
1
Parent(s):
347bbc4
20230624
Browse files- README.md +4 -1
- digester/chatgpt_service.py +3 -54
- digester/gradio_method_service.py +26 -3
- digester/gradio_ui_service.py +1 -1
- digester/test_youtube_chain.py +10 -3
- img/n_things_example.png +0 -0
README.md
CHANGED
@@ -12,6 +12,8 @@ license: mit
|
|
12 |
|
13 |
# DigestEverythingGPT
|
14 |
|
|
|
|
|
15 |
DigestEverythingGPT provides a world-class content summarization/query tool that leverages ChatGPT/LLMs to help users
|
16 |
quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF
|
17 |
documents.
|
@@ -19,6 +21,8 @@ documents.
|
|
19 |
The prompt engineering is **chained and tuned** so that its result is of high quality and fast. It is not a simple single
|
20 |
query and response tool.
|
21 |
|
|
|
|
|
22 |
# Live website
|
23 |
|
24 |
https://huggingface.co/spaces/michaelthwan/digest-everything-gpt
|
@@ -79,4 +83,3 @@ DigestEverything-GPT is licensed under the MIT License.
|
|
79 |
# Acknowledgements
|
80 |
|
81 |
- chatgpt_academic for gradio code framework
|
82 |
-
|
|
|
12 |
|
13 |
# DigestEverythingGPT
|
14 |
|
15 |
+
|
16 |
+
|
17 |
DigestEverythingGPT provides a world-class content summarization/query tool that leverages ChatGPT/LLMs to help users
|
18 |
quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF
|
19 |
documents.
|
|
|
21 |
The prompt engineering is **chained and tuned** so that its result is of high quality and fast. It is not a simple single
|
22 |
query and response tool.
|
23 |
|
24 |
+
Please leave me a star🌟 if you like this project!
|
25 |
+
|
26 |
# Live website
|
27 |
|
28 |
https://huggingface.co/spaces/michaelthwan/digest-everything-gpt
|
|
|
83 |
# Acknowledgements
|
84 |
|
85 |
- chatgpt_academic for gradio code framework
|
|
digester/chatgpt_service.py
CHANGED
@@ -219,65 +219,13 @@ class ChatGPTService:
|
|
219 |
|
220 |
@staticmethod
|
221 |
def single_call_chatgpt_with_handling(source_md, prompt_str: str, prompt_show_user: str, chatbot, api_key, gpt_model, history=[]):
|
222 |
-
|
223 |
-
Handling
|
224 |
-
- token exceeding -> split input
|
225 |
-
- timeout -> retry 2 times
|
226 |
-
- other error -> retry 2 times
|
227 |
-
"""
|
228 |
-
|
229 |
-
TIMEOUT_SECONDS, MAX_RETRY = config['openai']['timeout_sec'], config['openai']['max_retry']
|
230 |
-
# When multi-threaded, you need a mutable structure to pass information between different threads
|
231 |
-
# list is the simplest mutable structure, we put gpt output in the first position, the second position to pass the error message
|
232 |
-
mutable_list = [None, ''] # [gpt_output, error_message]
|
233 |
-
|
234 |
-
# multi-threading worker
|
235 |
-
def mt(prompt_str, history):
|
236 |
-
while True:
|
237 |
-
try:
|
238 |
-
mutable_list[0] = ChatGPTService.single_rest_call_chatgpt(api_key, prompt_str, gpt_model, history=history)
|
239 |
-
break
|
240 |
-
except ConnectionAbortedError as token_exceeded_error:
|
241 |
-
# # Try to calculate the ratio and keep as much text as possible
|
242 |
-
# print(f'[Local Message] Token exceeded: {token_exceeded_error}.')
|
243 |
-
# p_ratio, n_exceed = ChatGPTService.get_reduce_token_percent(str(token_exceeded_error))
|
244 |
-
# if len(history) > 0:
|
245 |
-
# history = [his[int(len(his) * p_ratio):] for his in history if his is not None]
|
246 |
-
# else:
|
247 |
-
# prompt_str = prompt_str[:int(len(prompt_str) * p_ratio)]
|
248 |
-
# mutable_list[1] = f'Warning: text too long will be truncated. Token exceeded:{n_exceed},Truncation ratio: {(1 - p_ratio):.0%}。'
|
249 |
-
mutable_list[0] = TOKEN_EXCEED_MSG
|
250 |
-
except TimeoutError as e:
|
251 |
-
mutable_list[0] = TIMEOUT_MSG
|
252 |
-
raise TimeoutError
|
253 |
-
except Exception as e:
|
254 |
-
mutable_list[0] = f'{provide_text_with_css("ERROR", "red")} Exception: {str(e)}.'
|
255 |
-
raise RuntimeError(f'[ERROR] Exception: {str(e)}.')
|
256 |
-
# TODO retry
|
257 |
-
|
258 |
-
# Create a new thread to make http requests
|
259 |
-
thread_name = threading.Thread(target=mt, args=(prompt_str, history))
|
260 |
-
thread_name.start()
|
261 |
-
# The original thread is responsible for continuously updating the UI, implementing a timeout countdown, and waiting for the new thread's task to complete
|
262 |
-
cnt = 0
|
263 |
-
while thread_name.is_alive():
|
264 |
-
cnt += 1
|
265 |
-
is_append = False
|
266 |
-
if cnt == 1:
|
267 |
-
is_append = True
|
268 |
-
yield from ChatGPTService.say(prompt_show_user, f"""
|
269 |
-
{provide_text_with_css("PROCESSING...", "blue")} {mutable_list[1]}waiting gpt response {cnt}/{TIMEOUT_SECONDS * 2 * (MAX_RETRY + 1)}{''.join(['.'] * (cnt % 4))}
|
270 |
-
{mutable_list[0]}
|
271 |
-
""", chatbot, history, 'Normal', source_md, is_append)
|
272 |
-
time.sleep(1)
|
273 |
-
# Get the output of gpt out of the mutable
|
274 |
-
gpt_response = mutable_list[0]
|
275 |
if 'ERROR' in gpt_response:
|
276 |
raise Exception
|
277 |
return gpt_response
|
278 |
|
279 |
@staticmethod
|
280 |
-
def single_rest_call_chatgpt(api_key, prompt_str: str, gpt_model, history=[], observe_window=None):
|
281 |
"""
|
282 |
Single call chatgpt only. No handling on multiple call (it should be in upper caller multi_call_chatgpt_with_handling())
|
283 |
- Support stream=True
|
@@ -324,6 +272,7 @@ class ChatGPTService:
|
|
324 |
if "content" in delta:
|
325 |
result += delta["content"]
|
326 |
print(delta["content"], end='')
|
|
|
327 |
if observe_window is not None: observe_window[0] += delta["content"]
|
328 |
else:
|
329 |
raise RuntimeError("Unexpected Json structure: " + delta)
|
|
|
219 |
|
220 |
@staticmethod
|
221 |
def single_call_chatgpt_with_handling(source_md, prompt_str: str, prompt_show_user: str, chatbot, api_key, gpt_model, history=[]):
|
222 |
+
gpt_response = yield from ChatGPTService.single_rest_call_chatgpt(api_key, prompt_str, gpt_model, chatbot, history=history)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
223 |
if 'ERROR' in gpt_response:
|
224 |
raise Exception
|
225 |
return gpt_response
|
226 |
|
227 |
@staticmethod
|
228 |
+
def single_rest_call_chatgpt(api_key, prompt_str: str, gpt_model, chatbot, history=[], observe_window=None):
|
229 |
"""
|
230 |
Single call chatgpt only. No handling on multiple call (it should be in upper caller multi_call_chatgpt_with_handling())
|
231 |
- Support stream=True
|
|
|
272 |
if "content" in delta:
|
273 |
result += delta["content"]
|
274 |
print(delta["content"], end='')
|
275 |
+
yield from ChatGPTService.say(None, result, chatbot, history, "Success", "", is_append=False)
|
276 |
if observe_window is not None: observe_window[0] += delta["content"]
|
277 |
else:
|
278 |
raise RuntimeError("Unexpected Json structure: " + delta)
|
digester/gradio_method_service.py
CHANGED
@@ -245,9 +245,12 @@ Give the video type with JSON format like {"type": "N things"}, and exclude othe
|
|
245 |
prompt_suffix="""
|
246 |
[TASK]
|
247 |
Convert this into youtube summary.
|
248 |
-
|
249 |
Start with the timestamp followed by the summarized text for that chunk.
|
250 |
Must use language: {language}
|
|
|
|
|
|
|
251 |
|
252 |
Example format:
|
253 |
{first_timestamp} - This is the first part
|
@@ -270,6 +273,7 @@ Example format:
|
|
270 |
Summarize the above points under 30 words. Step by step showing points for the main concepts.
|
271 |
Use markdown format.
|
272 |
Must use language: {language}
|
|
|
273 |
{task_constraint}
|
274 |
|
275 |
The format is like:
|
@@ -302,7 +306,8 @@ Highlights:
|
|
302 |
- [Emoji] (content of highlights)
|
303 |
- [Emoji] (content of highlights)
|
304 |
|
305 |
-
For highlight, up to five concise bullet points, less than
|
|
|
306 |
""",
|
307 |
}
|
308 |
|
@@ -353,9 +358,18 @@ For highlight, up to five concise bullet points, less than 15 words for each bul
|
|
353 |
transcript_with_ts = ""
|
354 |
for entry in youtube_data.ts_transcript_list:
|
355 |
transcript_with_ts += f"{int(entry['start'] // 60)}:{int(entry['start'] % 60):02d} {entry['text']}\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
356 |
prompt = Prompt(cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
|
357 |
cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_main.format(transcript_with_ts=transcript_with_ts),
|
358 |
cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_suffix.replace("{language}", g_inputs.language_textbox)
|
|
|
359 |
)
|
360 |
prompt_show_user = "Generate the timestamped summary"
|
361 |
response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs, is_timestamp=True)
|
@@ -368,10 +382,19 @@ For highlight, up to five concise bullet points, less than 15 words for each bul
|
|
368 |
task_constraint = cls.FINAL_SUMMARY_TASK_CONSTRAINTS[video_type]
|
369 |
else:
|
370 |
task_constraint = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
371 |
prompt = Prompt(
|
372 |
cls.FINAL_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
|
373 |
cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=youtube_data.full_content),
|
374 |
-
cls.FINAL_SUMMARY_PROMPT.prompt_suffix.format(task_constraint=task_constraint,
|
|
|
|
|
375 |
)
|
376 |
prompt_show_user = "Generate the final summary"
|
377 |
response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
|
|
|
245 |
prompt_suffix="""
|
246 |
[TASK]
|
247 |
Convert this into youtube summary.
|
248 |
+
Combine and merge timestamp to for 2-5 minutes chunk. Maximum {word_limit} using noun for one line. Must not exceed the limit
|
249 |
Start with the timestamp followed by the summarized text for that chunk.
|
250 |
Must use language: {language}
|
251 |
+
Strictly follow the task rules especially for language and character limit
|
252 |
+
|
253 |
+
Maximum {word_limit} using noun for one line. Using noun, not sentence
|
254 |
|
255 |
Example format:
|
256 |
{first_timestamp} - This is the first part
|
|
|
273 |
Summarize the above points under 30 words. Step by step showing points for the main concepts.
|
274 |
Use markdown format.
|
275 |
Must use language: {language}
|
276 |
+
Strictly follow the task rules and use {language} language
|
277 |
{task_constraint}
|
278 |
|
279 |
The format is like:
|
|
|
306 |
- [Emoji] (content of highlights)
|
307 |
- [Emoji] (content of highlights)
|
308 |
|
309 |
+
For highlight, up to five concise bullet points, less than {char_limit} for each bullet point. Put different appropriate emoji for each bullet point
|
310 |
+
Must use language {language} as output
|
311 |
""",
|
312 |
}
|
313 |
|
|
|
358 |
transcript_with_ts = ""
|
359 |
for entry in youtube_data.ts_transcript_list:
|
360 |
transcript_with_ts += f"{int(entry['start'] // 60)}:{int(entry['start'] % 60):02d} {entry['text']}\n"
|
361 |
+
|
362 |
+
def _get_char_limit(language: str):
|
363 |
+
"""If Chinese/Japan/Korean, use character limit. Otherwise, use word limit"""
|
364 |
+
if 'zh' in language or language in ["ja-JP", "ko-KR"]:
|
365 |
+
return f"15 {language} characters"
|
366 |
+
else:
|
367 |
+
return "8 words"
|
368 |
+
|
369 |
prompt = Prompt(cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
|
370 |
cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_main.format(transcript_with_ts=transcript_with_ts),
|
371 |
cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_suffix.replace("{language}", g_inputs.language_textbox)
|
372 |
+
.replace("{word_limit}", _get_char_limit(g_inputs.language_textbox))
|
373 |
)
|
374 |
prompt_show_user = "Generate the timestamped summary"
|
375 |
response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs, is_timestamp=True)
|
|
|
382 |
task_constraint = cls.FINAL_SUMMARY_TASK_CONSTRAINTS[video_type]
|
383 |
else:
|
384 |
task_constraint = ""
|
385 |
+
def _get_char_limit(language):
|
386 |
+
"""If Chinese/Japan/Korean, use character limit. Otherwise, use word limit"""
|
387 |
+
if 'zh' in language or language in ["ja-JP", "ko-KR"]:
|
388 |
+
return f"30 {language} characters"
|
389 |
+
else:
|
390 |
+
return "15 words"
|
391 |
+
|
392 |
prompt = Prompt(
|
393 |
cls.FINAL_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
|
394 |
cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=youtube_data.full_content),
|
395 |
+
cls.FINAL_SUMMARY_PROMPT.prompt_suffix.format(task_constraint=task_constraint,
|
396 |
+
format_constraint=format_constraint.replace("{char_limit}", _get_char_limit(g_inputs.language_textbox)).replace("{language}", g_inputs.language_textbox),
|
397 |
+
language=g_inputs.language_textbox)
|
398 |
)
|
399 |
prompt_show_user = "Generate the final summary"
|
400 |
response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
|
digester/gradio_ui_service.py
CHANGED
@@ -8,7 +8,7 @@ title_html = """
|
|
8 |
<p align=\"center\">
|
9 |
DigestEverythingGPT leverages ChatGPT/LLMs to help users quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF documents.<br>
|
10 |
The prompt engineering is chained and tuned so that is result is of high quality and fast. It is not a simple single query and response tool.<br>
|
11 |
-
Version
|
12 |
<a href="https://github.com/michaelthwan/digest-everything-gpt"><i class="fa fa-github"></i> Github</a>
|
13 |
) (
|
14 |
<a href="https://huggingface.co/spaces/michaelthwan/digest-everything-gpt"> HFSpace</a>
|
|
|
8 |
<p align=\"center\">
|
9 |
DigestEverythingGPT leverages ChatGPT/LLMs to help users quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF documents.<br>
|
10 |
The prompt engineering is chained and tuned so that is result is of high quality and fast. It is not a simple single query and response tool.<br>
|
11 |
+
Version 20230624 (
|
12 |
<a href="https://github.com/michaelthwan/digest-everything-gpt"><i class="fa fa-github"></i> Github</a>
|
13 |
) (
|
14 |
<a href="https://huggingface.co/spaces/michaelthwan/digest-everything-gpt"> HFSpace</a>
|
digester/test_youtube_chain.py
CHANGED
@@ -49,6 +49,11 @@ class VideoExample:
|
|
49 |
video_id = "lF_KWLfQFs8"
|
50 |
return VideoExample.get_youtube_data("", video_id)
|
51 |
|
|
|
|
|
|
|
|
|
|
|
52 |
|
53 |
class YoutubeTestChain:
|
54 |
def __init__(self, api_key: str, gpt_model):
|
@@ -87,9 +92,11 @@ if __name__ == '__main__':
|
|
87 |
GPT_MODEL = "gpt-3.5-turbo-16k"
|
88 |
assert api_key
|
89 |
|
90 |
-
|
91 |
-
gradio_inputs = GradioInputs(apikey_textbox=api_key, gpt_model_textbox=GPT_MODEL,
|
92 |
-
|
|
|
|
|
93 |
|
94 |
youtube_test_chain = YoutubeTestChain(api_key, GPT_MODEL)
|
95 |
# youtube_test_chain.test_youtube_classifier(gradio_inputs, youtube_data)
|
|
|
49 |
video_id = "lF_KWLfQFs8"
|
50 |
return VideoExample.get_youtube_data("", video_id)
|
51 |
|
52 |
+
@staticmethod
|
53 |
+
def get_wealth_CN_long_vid():
|
54 |
+
video_id = "6mVX78_nq0A"
|
55 |
+
return VideoExample.get_youtube_data("", video_id)
|
56 |
+
|
57 |
|
58 |
class YoutubeTestChain:
|
59 |
def __init__(self, api_key: str, gpt_model):
|
|
|
92 |
GPT_MODEL = "gpt-3.5-turbo-16k"
|
93 |
assert api_key
|
94 |
|
95 |
+
language = "zh-CN"
|
96 |
+
gradio_inputs = GradioInputs(apikey_textbox=api_key, gpt_model_textbox=GPT_MODEL,
|
97 |
+
source_textbox="", source_target_textbox="",
|
98 |
+
qa_textbox="", language_textbox=language, chatbot=[], history=[])
|
99 |
+
youtube_data: YoutubeData = VideoExample.get_nthings_10_autogpt()
|
100 |
|
101 |
youtube_test_chain = YoutubeTestChain(api_key, GPT_MODEL)
|
102 |
# youtube_test_chain.test_youtube_classifier(gradio_inputs, youtube_data)
|
img/n_things_example.png
CHANGED