michaelthwan committed on
Commit
a4aef81
·
1 Parent(s): 347bbc4
README.md CHANGED
@@ -12,6 +12,8 @@ license: mit
12
 
13
  # DigestEverythingGPT
14
 
 
 
15
  DigestEverythingGPT provides a world-class content summarization/query tool that leverages ChatGPT/LLMs to help users
16
  quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF
17
  documents.
@@ -19,6 +21,8 @@ documents.
19
  The prompt engineering is **chained and tuned** so that its result is of high quality and fast. It is not a simple single
20
  query and response tool.
21
 
 
 
22
  # Live website
23
 
24
  https://huggingface.co/spaces/michaelthwan/digest-everything-gpt
@@ -79,4 +83,3 @@ DigestEverything-GPT is licensed under the MIT License.
79
  # Acknowledgements
80
 
81
  - chatgpt_academic for gradio code framework
82
-
 
12
 
13
  # DigestEverythingGPT
14
 
15
+
16
+
17
  DigestEverythingGPT provides a world-class content summarization/query tool that leverages ChatGPT/LLMs to help users
18
  quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF
19
  documents.
 
21
  The prompt engineering is **chained and tuned** so that its result is of high quality and fast. It is not a simple single
22
  query and response tool.
23
 
24
+ Please leave me a star🌟 if you like this project!
25
+
26
  # Live website
27
 
28
  https://huggingface.co/spaces/michaelthwan/digest-everything-gpt
 
83
  # Acknowledgements
84
 
85
  - chatgpt_academic for gradio code framework
 
digester/chatgpt_service.py CHANGED
@@ -219,65 +219,13 @@ class ChatGPTService:
219
 
220
  @staticmethod
221
  def single_call_chatgpt_with_handling(source_md, prompt_str: str, prompt_show_user: str, chatbot, api_key, gpt_model, history=[]):
222
- """
223
- Handling
224
- - token exceeding -> split input
225
- - timeout -> retry 2 times
226
- - other error -> retry 2 times
227
- """
228
-
229
- TIMEOUT_SECONDS, MAX_RETRY = config['openai']['timeout_sec'], config['openai']['max_retry']
230
- # When multi-threaded, you need a mutable structure to pass information between different threads
231
- # list is the simplest mutable structure, we put gpt output in the first position, the second position to pass the error message
232
- mutable_list = [None, ''] # [gpt_output, error_message]
233
-
234
- # multi-threading worker
235
- def mt(prompt_str, history):
236
- while True:
237
- try:
238
- mutable_list[0] = ChatGPTService.single_rest_call_chatgpt(api_key, prompt_str, gpt_model, history=history)
239
- break
240
- except ConnectionAbortedError as token_exceeded_error:
241
- # # Try to calculate the ratio and keep as much text as possible
242
- # print(f'[Local Message] Token exceeded: {token_exceeded_error}.')
243
- # p_ratio, n_exceed = ChatGPTService.get_reduce_token_percent(str(token_exceeded_error))
244
- # if len(history) > 0:
245
- # history = [his[int(len(his) * p_ratio):] for his in history if his is not None]
246
- # else:
247
- # prompt_str = prompt_str[:int(len(prompt_str) * p_ratio)]
248
- # mutable_list[1] = f'Warning: text too long will be truncated. Token exceeded:{n_exceed},Truncation ratio: {(1 - p_ratio):.0%}。'
249
- mutable_list[0] = TOKEN_EXCEED_MSG
250
- except TimeoutError as e:
251
- mutable_list[0] = TIMEOUT_MSG
252
- raise TimeoutError
253
- except Exception as e:
254
- mutable_list[0] = f'{provide_text_with_css("ERROR", "red")} Exception: {str(e)}.'
255
- raise RuntimeError(f'[ERROR] Exception: {str(e)}.')
256
- # TODO retry
257
-
258
- # Create a new thread to make http requests
259
- thread_name = threading.Thread(target=mt, args=(prompt_str, history))
260
- thread_name.start()
261
- # The original thread is responsible for continuously updating the UI, implementing a timeout countdown, and waiting for the new thread's task to complete
262
- cnt = 0
263
- while thread_name.is_alive():
264
- cnt += 1
265
- is_append = False
266
- if cnt == 1:
267
- is_append = True
268
- yield from ChatGPTService.say(prompt_show_user, f"""
269
- {provide_text_with_css("PROCESSING...", "blue")} {mutable_list[1]}waiting gpt response {cnt}/{TIMEOUT_SECONDS * 2 * (MAX_RETRY + 1)}{''.join(['.'] * (cnt % 4))}
270
- {mutable_list[0]}
271
- """, chatbot, history, 'Normal', source_md, is_append)
272
- time.sleep(1)
273
- # Get the output of gpt out of the mutable
274
- gpt_response = mutable_list[0]
275
  if 'ERROR' in gpt_response:
276
  raise Exception
277
  return gpt_response
278
 
279
  @staticmethod
280
- def single_rest_call_chatgpt(api_key, prompt_str: str, gpt_model, history=[], observe_window=None):
281
  """
282
  Single call chatgpt only. No handling on multiple call (it should be in upper caller multi_call_chatgpt_with_handling())
283
  - Support stream=True
@@ -324,6 +272,7 @@ class ChatGPTService:
324
  if "content" in delta:
325
  result += delta["content"]
326
  print(delta["content"], end='')
 
327
  if observe_window is not None: observe_window[0] += delta["content"]
328
  else:
329
  raise RuntimeError("Unexpected Json structure: " + delta)
 
219
 
220
  @staticmethod
221
  def single_call_chatgpt_with_handling(source_md, prompt_str: str, prompt_show_user: str, chatbot, api_key, gpt_model, history=[]):
222
+ gpt_response = yield from ChatGPTService.single_rest_call_chatgpt(api_key, prompt_str, gpt_model, chatbot, history=history)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  if 'ERROR' in gpt_response:
224
  raise Exception
225
  return gpt_response
226
 
227
  @staticmethod
228
+ def single_rest_call_chatgpt(api_key, prompt_str: str, gpt_model, chatbot, history=[], observe_window=None):
229
  """
230
  Single call chatgpt only. No handling on multiple call (it should be in upper caller multi_call_chatgpt_with_handling())
231
  - Support stream=True
 
272
  if "content" in delta:
273
  result += delta["content"]
274
  print(delta["content"], end='')
275
+ yield from ChatGPTService.say(None, result, chatbot, history, "Success", "", is_append=False)
276
  if observe_window is not None: observe_window[0] += delta["content"]
277
  else:
278
  raise RuntimeError("Unexpected Json structure: " + delta)
digester/gradio_method_service.py CHANGED
@@ -245,9 +245,12 @@ Give the video type with JSON format like {"type": "N things"}, and exclude othe
245
  prompt_suffix="""
246
  [TASK]
247
  Convert this into youtube summary.
248
- Separate for 2-5 minutes chunk, maximum 6 words as a noun for one line.
249
  Start with the timestamp followed by the summarized text for that chunk.
250
  Must use language: {language}
 
 
 
251
 
252
  Example format:
253
  {first_timestamp} - This is the first part
@@ -270,6 +273,7 @@ Example format:
270
  Summarize the above points under 30 words. Step by step showing points for the main concepts.
271
  Use markdown format.
272
  Must use language: {language}
 
273
  {task_constraint}
274
 
275
  The format is like:
@@ -302,7 +306,8 @@ Highlights:
302
  - [Emoji] (content of highlights)
303
  - [Emoji] (content of highlights)
304
 
305
- For highlight, up to five concise bullet points, less than 15 words for each bullet point. Put different appropriate emoji for each bullet point
 
306
  """,
307
  }
308
 
@@ -353,9 +358,18 @@ For highlight, up to five concise bullet points, less than 15 words for each bul
353
  transcript_with_ts = ""
354
  for entry in youtube_data.ts_transcript_list:
355
  transcript_with_ts += f"{int(entry['start'] // 60)}:{int(entry['start'] % 60):02d} {entry['text']}\n"
 
 
 
 
 
 
 
 
356
  prompt = Prompt(cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
357
  cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_main.format(transcript_with_ts=transcript_with_ts),
358
  cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_suffix.replace("{language}", g_inputs.language_textbox)
 
359
  )
360
  prompt_show_user = "Generate the timestamped summary"
361
  response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs, is_timestamp=True)
@@ -368,10 +382,19 @@ For highlight, up to five concise bullet points, less than 15 words for each bul
368
  task_constraint = cls.FINAL_SUMMARY_TASK_CONSTRAINTS[video_type]
369
  else:
370
  task_constraint = ""
 
 
 
 
 
 
 
371
  prompt = Prompt(
372
  cls.FINAL_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
373
  cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=youtube_data.full_content),
374
- cls.FINAL_SUMMARY_PROMPT.prompt_suffix.format(task_constraint=task_constraint, format_constraint=format_constraint, language=g_inputs.language_textbox)
 
 
375
  )
376
  prompt_show_user = "Generate the final summary"
377
  response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
 
245
  prompt_suffix="""
246
  [TASK]
247
  Convert this into youtube summary.
248
+ Combine and merge timestamp to for 2-5 minutes chunk. Maximum {word_limit} using noun for one line. Must not exceed the limit
249
  Start with the timestamp followed by the summarized text for that chunk.
250
  Must use language: {language}
251
+ Strictly follow the task rules especially for language and character limit
252
+
253
+ Maximum {word_limit} using noun for one line. Using noun, not sentence
254
 
255
  Example format:
256
  {first_timestamp} - This is the first part
 
273
  Summarize the above points under 30 words. Step by step showing points for the main concepts.
274
  Use markdown format.
275
  Must use language: {language}
276
+ Strictly follow the task rules and use {language} language
277
  {task_constraint}
278
 
279
  The format is like:
 
306
  - [Emoji] (content of highlights)
307
  - [Emoji] (content of highlights)
308
 
309
+ For highlight, up to five concise bullet points, less than {char_limit} for each bullet point. Put different appropriate emoji for each bullet point
310
+ Must use language {language} as output
311
  """,
312
  }
313
 
 
358
  transcript_with_ts = ""
359
  for entry in youtube_data.ts_transcript_list:
360
  transcript_with_ts += f"{int(entry['start'] // 60)}:{int(entry['start'] % 60):02d} {entry['text']}\n"
361
+
362
+ def _get_char_limit(language: str):
363
+ """If Chinese/Japan/Korean, use character limit. Otherwise, use word limit"""
364
+ if 'zh' in language or language in ["ja-JP", "ko-KR"]:
365
+ return f"15 {language} characters"
366
+ else:
367
+ return "8 words"
368
+
369
  prompt = Prompt(cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
370
  cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_main.format(transcript_with_ts=transcript_with_ts),
371
  cls.TIMESTAMPED_SUMMARY_PROMPT.prompt_suffix.replace("{language}", g_inputs.language_textbox)
372
+ .replace("{word_limit}", _get_char_limit(g_inputs.language_textbox))
373
  )
374
  prompt_show_user = "Generate the timestamped summary"
375
  response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs, is_timestamp=True)
 
382
  task_constraint = cls.FINAL_SUMMARY_TASK_CONSTRAINTS[video_type]
383
  else:
384
  task_constraint = ""
385
+ def _get_char_limit(language):
386
+ """If Chinese/Japan/Korean, use character limit. Otherwise, use word limit"""
387
+ if 'zh' in language or language in ["ja-JP", "ko-KR"]:
388
+ return f"30 {language} characters"
389
+ else:
390
+ return "15 words"
391
+
392
  prompt = Prompt(
393
  cls.FINAL_SUMMARY_PROMPT.prompt_prefix.format(title=youtube_data.title),
394
  cls.FINAL_SUMMARY_PROMPT.prompt_main.format(transcript=youtube_data.full_content),
395
+ cls.FINAL_SUMMARY_PROMPT.prompt_suffix.format(task_constraint=task_constraint,
396
+ format_constraint=format_constraint.replace("{char_limit}", _get_char_limit(g_inputs.language_textbox)).replace("{language}", g_inputs.language_textbox),
397
+ language=g_inputs.language_textbox)
398
  )
399
  prompt_show_user = "Generate the final summary"
400
  response, len_prompts = yield from ChatGPTService.trigger_callgpt_pipeline(prompt, prompt_show_user, g_inputs)
digester/gradio_ui_service.py CHANGED
@@ -8,7 +8,7 @@ title_html = """
8
  <p align=\"center\">
9
  DigestEverythingGPT leverages ChatGPT/LLMs to help users quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF documents.<br>
10
  The prompt engineering is chained and tuned so that is result is of high quality and fast. It is not a simple single query and response tool.<br>
11
- Version 20230619 (
12
  <a href="https://github.com/michaelthwan/digest-everything-gpt"><i class="fa fa-github"></i> Github</a>
13
  ) (
14
  <a href="https://huggingface.co/spaces/michaelthwan/digest-everything-gpt"> HFSpace</a>
 
8
  <p align=\"center\">
9
  DigestEverythingGPT leverages ChatGPT/LLMs to help users quickly understand essential information from various forms of content, such as podcasts, YouTube videos, and PDF documents.<br>
10
  The prompt engineering is chained and tuned so that is result is of high quality and fast. It is not a simple single query and response tool.<br>
11
+ Version 20230624 (
12
  <a href="https://github.com/michaelthwan/digest-everything-gpt"><i class="fa fa-github"></i> Github</a>
13
  ) (
14
  <a href="https://huggingface.co/spaces/michaelthwan/digest-everything-gpt"> HFSpace</a>
digester/test_youtube_chain.py CHANGED
@@ -49,6 +49,11 @@ class VideoExample:
49
  video_id = "lF_KWLfQFs8"
50
  return VideoExample.get_youtube_data("", video_id)
51
 
 
 
 
 
 
52
 
53
  class YoutubeTestChain:
54
  def __init__(self, api_key: str, gpt_model):
@@ -87,9 +92,11 @@ if __name__ == '__main__':
87
  GPT_MODEL = "gpt-3.5-turbo-16k"
88
  assert api_key
89
 
90
-
91
- gradio_inputs = GradioInputs(apikey_textbox=api_key, gpt_model_textbox=GPT_MODEL, source_textbox="", source_target_textbox="", qa_textbox="", language_textbox="en-US", chatbot=[], history=[])
92
- youtube_data: YoutubeData = VideoExample.get_nthings_8_habits()
 
 
93
 
94
  youtube_test_chain = YoutubeTestChain(api_key, GPT_MODEL)
95
  # youtube_test_chain.test_youtube_classifier(gradio_inputs, youtube_data)
 
49
  video_id = "lF_KWLfQFs8"
50
  return VideoExample.get_youtube_data("", video_id)
51
 
52
+ @staticmethod
53
+ def get_wealth_CN_long_vid():
54
+ video_id = "6mVX78_nq0A"
55
+ return VideoExample.get_youtube_data("", video_id)
56
+
57
 
58
  class YoutubeTestChain:
59
  def __init__(self, api_key: str, gpt_model):
 
92
  GPT_MODEL = "gpt-3.5-turbo-16k"
93
  assert api_key
94
 
95
+ language = "zh-CN"
96
+ gradio_inputs = GradioInputs(apikey_textbox=api_key, gpt_model_textbox=GPT_MODEL,
97
+ source_textbox="", source_target_textbox="",
98
+ qa_textbox="", language_textbox=language, chatbot=[], history=[])
99
+ youtube_data: YoutubeData = VideoExample.get_nthings_10_autogpt()
100
 
101
  youtube_test_chain = YoutubeTestChain(api_key, GPT_MODEL)
102
  # youtube_test_chain.test_youtube_classifier(gradio_inputs, youtube_data)
img/n_things_example.png CHANGED