3v324v23 committed on
Commit
85d85d8
·
1 Parent(s): dcaa7a1
crazy_functions/crazy_utils.py CHANGED
@@ -37,6 +37,7 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
37
  lines = txt_tocut.split('\n')
38
  estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
39
  estimated_line_cut = int(estimated_line_cut)
 
40
  for cnt in reversed(range(estimated_line_cut)):
41
  if must_break_at_empty_line:
42
  if lines[cnt] != "": continue
@@ -45,7 +46,7 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
45
  post = "\n".join(lines[cnt:])
46
  if get_token_fn(prev) < limit: break
47
  if cnt == 0:
48
- print('what the fuck ?')
49
  raise RuntimeError("存在一行极长的文本!")
50
  # print(len(post))
51
  # 列表递归接龙
@@ -55,4 +56,10 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
55
  try:
56
  return cut(txt, must_break_at_empty_line=True)
57
  except RuntimeError:
58
- return cut(txt, must_break_at_empty_line=False)
 
 
 
 
 
 
 
37
  lines = txt_tocut.split('\n')
38
  estimated_line_cut = limit / get_token_fn(txt_tocut) * len(lines)
39
  estimated_line_cut = int(estimated_line_cut)
40
+ cnt = 0
41
  for cnt in reversed(range(estimated_line_cut)):
42
  if must_break_at_empty_line:
43
  if lines[cnt] != "": continue
 
46
  post = "\n".join(lines[cnt:])
47
  if get_token_fn(prev) < limit: break
48
  if cnt == 0:
49
+ # print('what the fuck ? 存在一行极长的文本!')
50
  raise RuntimeError("存在一行极长的文本!")
51
  # print(len(post))
52
  # 列表递归接龙
 
56
  try:
57
  return cut(txt, must_break_at_empty_line=True)
58
  except RuntimeError:
59
+ try:
60
+ return cut(txt, must_break_at_empty_line=False)
61
+ except RuntimeError:
62
+ # 这个中文的句号是故意的,作为一个标识而存在
63
+ res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
64
+ return [r.replace('。\n', '.') for r in res]
65
+
crazy_functions/批量翻译PDF文档_多线程.py CHANGED
@@ -1,7 +1,6 @@
1
  from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
2
  import re
3
  import unicodedata
4
- fast_debug = False
5
 
6
 
7
  def is_paragraph_break(match):
@@ -61,7 +60,6 @@ def clean_text(raw_text):
61
 
62
  return final_text.strip()
63
 
64
-
65
  def read_and_clean_pdf_text(fp):
66
  import fitz, re
67
  import numpy as np
@@ -69,19 +67,16 @@ def read_and_clean_pdf_text(fp):
69
  with fitz.open(fp) as doc:
70
  meta_txt = []
71
  meta_font = []
72
- for page in doc:
73
  # file_content += page.get_text()
74
  text_areas = page.get_text("dict") # 获取页面上的文本信息
75
 
 
 
 
 
 
76
 
77
- # # 行元提取 for each word segment with in line for each line for each block
78
- # meta_txt.extend( [ ["".join( [wtf['text'] for wtf in l['spans'] ]) for l in t['lines'] ] for t in text_areas['blocks'] if 'lines' in t])
79
- # meta_font.extend([ [ np.mean([wtf['size'] for wtf in l['spans'] ]) for l in t['lines'] ] for t in text_areas['blocks'] if 'lines' in t])
80
-
81
- # 块元提取 for each word segment with in line for each line for each block
82
- meta_txt.extend( [ " ".join(["".join( [wtf['text'] for wtf in l['spans'] ]) for l in t['lines'] ]) for t in text_areas['blocks'] if 'lines' in t])
83
- meta_font.extend([ np.mean( [ np.mean([wtf['size'] for wtf in l['spans'] ]) for l in t['lines'] ]) for t in text_areas['blocks'] if 'lines' in t])
84
-
85
  def 把字符太少的块清除为回车(meta_txt):
86
  for index, block_txt in enumerate(meta_txt):
87
  if len(block_txt) < 100:
@@ -123,19 +118,17 @@ def read_and_clean_pdf_text(fp):
123
  # 换行 -> 双换行
124
  meta_txt = meta_txt.replace('\n', '\n\n')
125
 
126
- # print(meta_txt)
127
-
128
- return meta_txt
129
 
130
  @CatchException
131
- def 批量翻译PDF文档(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
132
  import glob
133
  import os
134
 
135
  # 基本信息:功能、贡献者
136
  chatbot.append([
137
  "函数插件功能?",
138
- "批量总结PDF文档。函数插件贡献者: Binary-Husky, ValeriaWong, Eralien"])
139
  yield chatbot, history, '正常'
140
 
141
  # 尝试导入依赖,如果缺少依赖,则给出安装建议
@@ -174,82 +167,116 @@ def 批量翻译PDF文档(txt, top_p, temperature, chatbot, history, systemPromp
174
  return
175
 
176
  # 开始正式执行任务
177
- yield from 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt)
178
 
179
 
180
- def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, systemPromptTxt):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  import time
182
  import glob
183
  import os
184
  import fitz
185
  import tiktoken
186
- from concurrent.futures import ThreadPoolExecutor
187
- print('begin analysis on:', file_manifest)
188
  for index, fp in enumerate(file_manifest):
189
- ### 1. 读取PDF文件
190
- file_content = read_and_clean_pdf_text(fp)
191
- ### 2. 递归地切割PDF文件
192
  from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
193
  enc = tiktoken.get_encoding("gpt2")
194
- TOKEN_LIMIT_PER_FRAGMENT = 2048
195
  get_token_num = lambda txt: len(enc.encode(txt))
196
- # 分解
197
- paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
198
- txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
199
- print([get_token_num(frag) for frag in paper_fragments])
200
- ### 3. 逐个段落翻译
201
- ## 3.1. 多线程开始
202
- from request_llm.bridge_chatgpt import predict_no_ui_long_connection
203
- n_frag = len(paper_fragments)
204
- # 异步原子
205
- mutable = [["", time.time()] for _ in range(n_frag)]
206
- # 翻译函数
207
- def translate_(index, fragment, mutable):
208
- i_say = f"以下是你需要翻译的文章段落:{fragment}"
209
- # 请求gpt,需要一段时间
210
- gpt_say = predict_no_ui_long_connection(
211
- inputs=i_say, top_p=top_p, temperature=temperature, history=[], # ["请翻译:" if len(previous_result)!=0 else "", previous_result],
212
- sys_prompt="请你作为一个学术翻译,负责将给定的文章段落翻译成中文,要求语言简洁、精准、凝练。你只需要给出翻译后的文本,不能重复原文。",
213
- observe_window=mutable[index])
214
- return gpt_say
215
- ### 4. 异步任务开始
216
- executor = ThreadPoolExecutor(max_workers=16)
217
- # Submit tasks to the pool
218
- futures = [executor.submit(translate_, index, frag, mutable) for index, frag in enumerate(paper_fragments)]
219
-
220
- ### 5. UI主线程,在任务期间提供实时的前端显示
221
- cnt = 0
222
- while True:
223
- cnt += 1
224
- time.sleep(1)
225
- worker_done = [h.done() for h in futures]
226
- if all(worker_done):
227
- executor.shutdown(); break
228
- # 更好的UI视觉效果
229
- observe_win = []
230
- # 每个线程都要喂狗(看门狗)
231
- for thread_index, _ in enumerate(worker_done):
232
- mutable[thread_index][1] = time.time()
233
- # 在前端打印些好玩的东西
234
- for thread_index, _ in enumerate(worker_done):
235
- print_something_really_funny = "[ ...`"+mutable[thread_index][0][-30:].replace('\n','').replace('```','...').replace(' ','.').replace('<br/>','.....').replace('$','.')+"`... ]"
236
- observe_win.append(print_something_really_funny)
237
- stat_str = ''.join([f'执行中: {obs}\n\n' if not done else '已完成\n\n' for done, obs in zip(worker_done, observe_win)])
238
- chatbot[-1] = [chatbot[-1][0], f'多线程操作已经开始,完成情况: \n\n{stat_str}' + ''.join(['.']*(cnt%10+1))]; msg = "正常"
239
- yield chatbot, history, msg
240
-
241
- # Wait for tasks to complete
242
- results = [future.result() for future in futures]
243
-
244
- print(results)
245
- # full_result += gpt_say
246
-
247
- # history.extend([fp, full_result])
248
-
249
- res = write_results_to_file(history)
250
- chatbot.append(("完成了吗?", res)); msg = "完成"
251
- yield chatbot, history, msg
252
-
253
-
254
- # if __name__ == '__main__':
255
- # pro()
 
1
  from toolbox import CatchException, report_execption, write_results_to_file, predict_no_ui_but_counting_down
2
  import re
3
  import unicodedata
 
4
 
5
 
6
  def is_paragraph_break(match):
 
60
 
61
  return final_text.strip()
62
 
 
63
  def read_and_clean_pdf_text(fp):
64
  import fitz, re
65
  import numpy as np
 
67
  with fitz.open(fp) as doc:
68
  meta_txt = []
69
  meta_font = []
70
+ for index, page in enumerate(doc):
71
  # file_content += page.get_text()
72
  text_areas = page.get_text("dict") # 获取页面上的文本信息
73
 
74
+ # 块元提取 for each word segment with in line for each line cross-line words for each block
75
+ meta_txt.extend( [ " ".join(["".join( [wtf['text'] for wtf in l['spans'] ]) for l in t['lines'] ]).replace('- ','') for t in text_areas['blocks'] if 'lines' in t])
76
+ meta_font.extend([ np.mean( [ np.mean([wtf['size'] for wtf in l['spans'] ]) for l in t['lines'] ]) for t in text_areas['blocks'] if 'lines' in t])
77
+ if index==0:
78
+ page_one_meta = [" ".join(["".join( [wtf['text'] for wtf in l['spans'] ]) for l in t['lines'] ]).replace('- ','') for t in text_areas['blocks'] if 'lines' in t]
79
 
 
 
 
 
 
 
 
 
80
  def 把字符太少的块清除为回车(meta_txt):
81
  for index, block_txt in enumerate(meta_txt):
82
  if len(block_txt) < 100:
 
118
  # 换行 -> 双换行
119
  meta_txt = meta_txt.replace('\n', '\n\n')
120
 
121
+ return meta_txt, page_one_meta
 
 
122
 
123
  @CatchException
124
+ def 批量翻译PDF文档(txt, top_p, temperature, chatbot, history, sys_prompt, WEB_PORT):
125
  import glob
126
  import os
127
 
128
  # 基本信息:功能、贡献者
129
  chatbot.append([
130
  "函数插件功能?",
131
+ "批量总结PDF文档。函数插件贡献者: Binary-Husky(二进制哈士奇)"])
132
  yield chatbot, history, '正常'
133
 
134
  # 尝试导入依赖,如果缺少依赖,则给出安装建议
 
167
  return
168
 
169
  # 开始正式执行任务
170
+ yield from 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt)
171
 
172
 
173
def request_gpt_model_in_new_thread_with_ui_alive(inputs, inputs_show_user, top_p, temperature, chatbot, history, sys_prompt, refresh_interval=0.2):
    """Run one GPT request in a worker thread while keeping the UI refreshed.

    This is a generator meant to be driven with ``yield from``. It yields
    ``(chatbot, [], msg)`` tuples so the front end can redraw, and its
    generator return value is the result of ``predict_no_ui_long_connection``.

    Args:
        inputs: Prompt forwarded to the model.
        inputs_show_user: Text displayed in the chat window for this request.
        top_p, temperature: Sampling parameters forwarded to the model.
        chatbot: Chat display list; a new ``[inputs_show_user, reply]`` entry
            is appended and then updated in place.
        history: Conversation history forwarded to the model.
        sys_prompt: System prompt forwarded to the model.
        refresh_interval: Seconds to sleep between two UI refreshes.

    Returns:
        Whatever ``predict_no_ui_long_connection`` returns for this request.

    Raises:
        Any exception raised inside the worker is re-raised by
        ``future.result()``.
    """
    import time
    from concurrent.futures import ThreadPoolExecutor
    from request_llm.bridge_chatgpt import predict_no_ui_long_connection

    # Give the user immediate feedback before the (slow) request starts.
    chatbot.append([inputs_show_user, ""])
    yield chatbot, [], '正常'

    # mutable[0]: partial reply (presumably filled in by the worker through
    #             observe_window -- confirm against bridge_chatgpt).
    # mutable[1]: heartbeat timestamp refreshed by this loop; the original
    #             comment calls this "feeding the watchdog".
    mutable = ["", time.time()]

    # A single task only ever needs a single worker.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(
            predict_no_ui_long_connection,
            inputs=inputs, top_p=top_p, temperature=temperature,
            history=history, sys_prompt=sys_prompt, observe_window=mutable,
        )
        while True:
            # Sleep, then yield once so the front end gets redrawn.
            time.sleep(refresh_interval)
            mutable[1] = time.time()
            if future.done():
                break
            chatbot[-1] = [chatbot[-1][0], mutable[0]]
            yield chatbot, [], "正常"
    finally:
        # Release the pool even when the consumer closes the generator early;
        # wait=False so closing never blocks on an in-flight request.
        executor.shutdown(wait=False)

    final_result = future.result()
    # Show the complete reply instead of leaving the last partial on screen.
    chatbot[-1] = [chatbot[-1][0], final_result]
    yield chatbot, [], "正常"
    return final_result
194
+
195
def request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(inputs_array, inputs_show_user_array, top_p, temperature, chatbot, history_array, sys_prompt_array, refresh_interval=0.2, max_workers=10, scroller_max_len=30):
    """Run many GPT requests concurrently while showing live progress.

    This is a generator meant to be driven with ``yield from``. It yields
    ``(chatbot, [], msg)`` tuples so the front end can redraw, and its
    generator return value is the flat list
    ``[show_0, reply_0, show_1, reply_1, ...]``.

    Args:
        inputs_array: One prompt per request.
        inputs_show_user_array: Display text paired with each reply in the
            returned list.
        top_p, temperature: Sampling parameters shared by all requests.
        chatbot: Chat display list; one progress entry is appended and then
            updated in place.
        history_array: One conversation history per request.
        sys_prompt_array: One system prompt per request.
        refresh_interval: Seconds to sleep between two UI refreshes.
        max_workers: Size of the thread pool.
        scroller_max_len: Number of trailing characters of each partial reply
            shown in the progress view.

    Returns:
        Flat list alternating display text and model reply, in input order.

    Raises:
        AssertionError: If the per-request arrays differ in length.
        Any exception raised inside a worker is re-raised by
        ``future.result()``.
    """
    import time
    from concurrent.futures import ThreadPoolExecutor
    from request_llm.bridge_chatgpt import predict_no_ui_long_connection

    assert len(inputs_array) == len(history_array)
    assert len(inputs_array) == len(sys_prompt_array)

    # One [partial_reply, heartbeat_timestamp] slot per request. The slot is
    # handed to the worker as observe_window; this loop refreshes the
    # timestamp (the original comment calls this "feeding the watchdog").
    mutable = [["", time.time()] for _ in inputs_array]

    def _req_gpt(index, inputs, history, sys_prompt):
        return predict_no_ui_long_connection(
            inputs=inputs, top_p=top_p, temperature=temperature,
            history=history, sys_prompt=sys_prompt,
            observe_window=mutable[index],
        )

    # Use a dedicated chat entry for the progress view so the caller's last
    # message is not overwritten (and an empty chatbot does not crash).
    chatbot.append(["请开始多线程操作。", ""])
    yield chatbot, [], "正常"

    executor = ThreadPoolExecutor(max_workers=max_workers)
    try:
        futures = [
            executor.submit(_req_gpt, index, inputs, history, sys_prompt)
            for index, (inputs, history, sys_prompt)
            in enumerate(zip(inputs_array, history_array, sys_prompt_array))
        ]
        cnt = 0
        while True:
            # Sleep, then yield once so the front end gets redrawn.
            time.sleep(refresh_interval)
            cnt += 1
            worker_done = [f.done() for f in futures]
            if all(worker_done):
                break
            # Refresh every worker's heartbeat.
            now = time.time()
            for slot in mutable:
                slot[1] = now
            # Build a one-line "scroller" preview of each partial reply;
            # markup characters are replaced so they cannot break rendering.
            observe_win = [
                "[ ...`"
                + slot[0][-scroller_max_len:]
                .replace('\n', '').replace('```', '...').replace(' ', '.')
                .replace('<br/>', '.....').replace('$', '.')
                + "`... ]"
                for slot in mutable
            ]
            stat_str = ''.join(
                '已完成\n\n' if done else f'执行中: {obs}\n\n'
                for done, obs in zip(worker_done, observe_win)
            )
            chatbot[-1] = [
                chatbot[-1][0],
                f'多线程操作已经开始,完成情况: \n\n{stat_str}' + '.' * (cnt % 10 + 1),
            ]
            yield chatbot, [], "正常"
    finally:
        # Release the pool even when the consumer closes the generator early;
        # wait=False so closing never blocks on in-flight requests.
        executor.shutdown(wait=False)

    # All workers are finished: collect replies in input order.
    gpt_response_collection = []
    for inputs_show_user, f in zip(inputs_show_user_array, futures):
        gpt_response_collection.extend([inputs_show_user, f.result()])
    return gpt_response_collection
236
+
237
def 解析PDF(file_manifest, project_folder, top_p, temperature, chatbot, history, sys_prompt):
    """Translate each PDF in ``file_manifest`` and write the result to a file.

    This is a generator meant to be driven with ``yield from``; it yields
    ``(chatbot, history, msg)`` style tuples for UI refreshes. For every file
    it extracts the text, splits it into token-limited fragments, asks the
    model for the paper's metadata (single request), translates all fragments
    concurrently, and hands the combined output to ``write_results_to_file``.

    Args:
        file_manifest: Iterable of PDF file paths.
        project_folder: Not used by this function.
        top_p, temperature: Sampling parameters forwarded to the model.
        chatbot: Chat display list, updated in place.
        history: Yielded back unchanged with the completion message.
        sys_prompt: Not used by this function; fixed prompts are used instead.
    """
    import tiktoken
    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf

    TOKEN_LIMIT_PER_FRAGMENT = 1600

    # The encoder and token counter do not depend on the file: build once.
    enc = tiktoken.get_encoding("gpt2")

    def get_token_num(txt):
        return len(enc.encode(txt))

    for fp in file_manifest:
        # Read the PDF: full cleaned text plus the raw blocks of page one.
        file_content, page_one = read_and_clean_pdf_text(fp)

        # Split both texts into fragments that fit the token budget.
        paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
            txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
        page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
            txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)

        # Keep only what precedes the Introduction heading: that part holds
        # the title/authors/abstract needed for the metadata request.
        paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]

        # Single request: extract the paper's metadata.
        paper_meta_info = yield from request_gpt_model_in_new_thread_with_ui_alive(
            inputs=f"以下是一篇学术论文的基础信息,请从中提取出“标题”、“收录会议或期刊”、“作者”、“摘要”、“编号”、“作者邮箱”这六个部分。请用markdown格式输出,最后用中文翻译摘要部分。请提取:{paper_meta}",
            inputs_show_user=f"请从{fp}中提取出“标题”、“收录会议或期刊”等基本信息。",
            top_p=top_p, temperature=temperature,
            chatbot=chatbot, history=[],
            sys_prompt="Your job is to collect information from materials。",
        )

        # Concurrent requests: translate every fragment, giving each one the
        # paper metadata as context. refresh_interval is passed explicitly
        # rather than relying on a default.
        gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
            inputs_array=[f"以下是你需要翻译的文章段落:\n{frag}" for frag in paper_fragments],
            inputs_show_user_array=["" for _ in paper_fragments],
            top_p=top_p, temperature=temperature,
            chatbot=chatbot,
            history_array=[[paper_meta] for _ in paper_fragments],
            sys_prompt_array=["请你作为一个学术翻译,把整个段落翻译成中文,要求语言简洁,禁止重复输出原文。" for _ in paper_fragments],
            refresh_interval=0.2,
            max_workers=16,  # upper bound on parallel requests chosen by the original author
        )

        # list.extend() mutates in place and returns None, so build the list
        # first and extend it in a separate statement.
        final = ["", paper_meta_info + '\n\n---\n\n---\n\n---\n\n']
        final.extend(gpt_response_collection)
        res = write_results_to_file(final)
        chatbot.append((f"{fp}完成了吗?", res))
        yield chatbot, history, "完成"
282
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
crazy_functions/高级功能函数模板.py CHANGED
@@ -5,7 +5,7 @@ import datetime
5
  @CatchException
6
  def 高阶功能模板函数(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
7
  history = [] # 清空历史,以免输入溢出
8
- chatbot.append(("这是什么功能?", "[Local Message] 请注意,您正在调用一个[函数插件]的模板,该函数面向希望实现更多有趣功能的开发者,它可以作为创建新功能函数的模板。为了做到简单易读,该函数只有25行代码,所以不会实时反馈文字流或心跳,请耐心等待程序输出完成。此外我们也提供可同步处理大量文件的多线程Demo供您参考。您若希望分享新的功能模组,请不吝PR!"))
9
  yield chatbot, history, '正常' # 由于请求gpt需要一段时间,我们先及时地做一次状态显示
10
 
11
  for i in range(5):
 
5
  @CatchException
6
  def 高阶功能模板函数(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
7
  history = [] # 清空历史,以免输入溢出
8
+ chatbot.append(("这是什么功能?", "[Local Message] 请注意,您正在调用一个[函数插件]的模板,该函数面向希望实现更多有趣功能的开发者,它可以作为创建新功能函数的模板(该函数只有25行代码)。此外我们也提供可同步处理大量文件的多线程Demo供您参考。您若希望分享新的功能模组,请不吝PR!"))
9
  yield chatbot, history, '正常' # 由于请求gpt需要一段时间,我们先及时地做一次状态显示
10
 
11
  for i in range(5):