shaocongma committed on
Commit
365213e
·
1 Parent(s): c9efba3
Files changed (5) hide show
  1. app.py +46 -36
  2. auto_backgrounds.py +38 -33
  3. latex_templates/pre_refs.bib +19 -16
  4. utils/prompts.py +9 -10
  5. utils/references.py +13 -13
app.py CHANGED
@@ -1,7 +1,7 @@
1
  import gradio as gr
2
  import os
3
  import openai
4
- from auto_backgrounds import generate_backgrounds, fake_generator, generate_draft
5
  from utils.file_operations import hash_name
6
 
7
  # note: App白屏bug:允许第三方cookie
@@ -9,12 +9,10 @@ from utils.file_operations import hash_name
9
  # 6. get logs when the procedure is not completed. *
10
  # 7. 自己的文件库; 更多的prompts
11
  # 8. Decide on how to generate the main part of a paper * (Langchain/AutoGPT
12
- # 9. Load .bibtex file to generate a pre-defined references list. *
13
  # 1. 把paper改成纯JSON?
14
  # 2. 实现别的功能
15
  # 3. Check API Key GPT-4 Support.
16
  # 8. Re-build some components using `langchain`
17
- # - in `references.py`, use PromptTemplates.format -> str
18
  # - in `gpt_interation`, use LLM
19
  # 5. 从提供的bib文件中 找到cite和citedby的文章, 计算embeddings; 从整个paper list中 根据cos距离进行排序; 选取max_refs的文章
20
  # future:
@@ -49,17 +47,12 @@ def clear_inputs(text1, text2):
49
 
50
 
51
  def wrapped_generator(paper_title, paper_description, openai_api_key=None,
52
- template="ICLR2022",
53
- cache_mode=IS_CACHE_AVAILABLE, generator=None):
54
  # if `cache_mode` is True, then follow the following steps:
55
  # check if "title"+"description" have been generated before
56
  # if so, download from the cloud storage, return it
57
  # if not, generate the result.
58
- if generator is None:
59
- # todo: add a Dropdown to select which generator to use.
60
- # generator = generate_backgrounds
61
- generator = generate_draft
62
- # generator = fake_generator
63
  if openai_api_key is not None:
64
  openai.api_key = openai_api_key
65
  openai.Model.list()
@@ -80,13 +73,17 @@ def wrapped_generator(paper_title, paper_description, openai_api_key=None,
80
  else:
81
  # generate the result.
82
  # output = fake_generate_backgrounds(title, description, openai_key)
83
- # todo: use `generator` to control which function to use.
84
- output = generator(paper_title, paper_description, template, "gpt-4")
 
 
85
  upload_file(output)
86
  return output
87
  else:
88
  # output = fake_generate_backgrounds(title, description, openai_key)
89
- output = generator(paper_title, paper_description, template, "gpt-4")
 
 
90
  return output
91
 
92
 
@@ -97,6 +94,14 @@ theme = gr.themes.Default(font=gr.themes.GoogleFont("Questrial"))
97
  # button_primary_background_fill="#281A39"
98
  # )
99
 
 
 
 
 
 
 
 
 
100
  with gr.Blocks(theme=theme) as demo:
101
  gr.Markdown('''
102
  # Auto-Draft: 文献整理辅助工具
@@ -107,11 +112,7 @@ with gr.Blocks(theme=theme) as demo:
107
 
108
  在这个Huggingface Organization里也提供一定额度的免费体验: [AUTO-ACADEMIC](https://huggingface.co/auto-academic).
109
 
110
- 如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
111
-
112
- ## 用法
113
-
114
- 输入想要生成的论文名称(比如Playing Atari with Deep Reinforcement Learning), 点击Submit, 等待大概十分钟, 下载.zip格式的输出,在Overleaf上编译浏览.
115
  ''')
116
 
117
  with gr.Row():
@@ -124,6 +125,9 @@ with gr.Blocks(theme=theme) as demo:
124
 
125
  # 每个功能做一个tab
126
  with gr.Tab("学术论文"):
 
 
 
127
  title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
128
  label="Title", info="论文标题")
129
 
@@ -131,33 +135,38 @@ with gr.Blocks(theme=theme) as demo:
131
  description_pp = gr.Textbox(lines=5, label="Description (Optional)", visible=True,
132
  info="对希望生成的论文的一些描述. 包括这篇论文的创新点, 主要贡献, 等.")
133
 
134
- interactive = False
135
- gr.Markdown('''
136
- ## 下面的功能我只做了UI, 还没来得及实现功能.
137
- ''')
138
  with gr.Row():
139
  with gr.Column():
 
 
 
 
 
 
 
140
  gr.Markdown('''
141
- Upload .bib file (Optional)
142
-
143
- 通过上传.bib文件来控制GPT-4模型必须参考哪些文献.
144
  ''')
145
  bibtex_file = gr.File(label="Upload .bib file", file_types=["text"],
146
- interactive=interactive)
 
 
 
 
147
  with gr.Column():
148
  search_engine = gr.Dropdown(label="Search Engine",
149
  choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
150
  value= "Semantic Scholar",
151
- interactive=interactive,
152
- info="用于决定GPT-4用什么搜索引擎来搜索文献. 选择None的时候仅参考给定文献.")
153
- tldr = gr.Checkbox(value=True, label="TLDR;",
154
  info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
155
- interactive = interactive),
156
- use_cache = gr.Checkbox(label="总是重新生成",
157
- info="选择此筐表示将不会读取已经生成好的文章.",
158
- interactive = interactive)
159
- slider = gr.Slider(minimum=1, maximum=30, value=20, label="最大参考文献数目",
160
- info="过多参考文献会超出Token数限制导致报错,这里限制最大参考文献数目.")
161
 
162
  with gr.Row():
163
  clear_button_pp = gr.Button("Clear")
@@ -196,7 +205,8 @@ with gr.Blocks(theme=theme) as demo:
196
  file_output = gr.File(label="Output")
197
 
198
  clear_button_pp.click(fn=clear_inputs, inputs=[title, description_pp], outputs=[title, description_pp])
199
- submit_button_pp.click(fn=wrapped_generator, inputs=[title, description_pp, key], outputs=file_output)
 
200
 
201
  demo.queue(concurrency_count=1, max_size=5, api_open=False)
202
  demo.launch()
 
1
  import gradio as gr
2
  import os
3
  import openai
4
+ from auto_backgrounds import generate_backgrounds, generate_draft
5
  from utils.file_operations import hash_name
6
 
7
  # note: App白屏bug:允许第三方cookie
 
9
  # 6. get logs when the procedure is not completed. *
10
  # 7. 自己的文件库; 更多的prompts
11
  # 8. Decide on how to generate the main part of a paper * (Langchain/AutoGPT
 
12
  # 1. 把paper改成纯JSON?
13
  # 2. 实现别的功能
14
  # 3. Check API Key GPT-4 Support.
15
  # 8. Re-build some components using `langchain`
 
16
  # - in `gpt_interation`, use LLM
17
  # 5. 从提供的bib文件中 找到cite和citedby的文章, 计算embeddings; 从整个paper list中 根据cos距离进行排序; 选取max_refs的文章
18
  # future:
 
47
 
48
 
49
  def wrapped_generator(paper_title, paper_description, openai_api_key=None,
50
+ template="ICLR2022", tldr=True, max_num_refs=50, sections=None, bib_refs=None, model="gpt-4",
51
+ cache_mode=IS_CACHE_AVAILABLE):
52
  # if `cache_mode` is True, then follow the following steps:
53
  # check if "title"+"description" have been generated before
54
  # if so, download from the cloud storage, return it
55
  # if not, generate the result.
 
 
 
 
 
56
  if openai_api_key is not None:
57
  openai.api_key = openai_api_key
58
  openai.Model.list()
 
73
  else:
74
  # generate the result.
75
  # output = fake_generate_backgrounds(title, description, openai_key)
76
+ output =generate_draft(paper_title, paper_description, template=template,
77
+ tldr=tldr, max_num_refs=max_num_refs,
78
+ sections=sections, bib_refs=bib_refs, model=model)
79
+ # output = generate_draft(paper_title, paper_description, template, "gpt-4")
80
  upload_file(output)
81
  return output
82
  else:
83
  # output = fake_generate_backgrounds(title, description, openai_key)
84
+ output =generate_draft(paper_title, paper_description, template=template,
85
+ tldr=tldr, max_num_refs=max_num_refs,
86
+ sections=sections, bib_refs=bib_refs, model=model)
87
  return output
88
 
89
 
 
94
  # button_primary_background_fill="#281A39"
95
  # )
96
 
97
+ ACADEMIC_PAPER = """## 一键生成论文初稿
98
+
99
+ 1. 在Title文本框中输入想要生成的论文名称(比如Playing Atari with Deep Reinforcement Learning).
100
+ 2. 点击Submit. 等待大概十分钟.
101
+ 3. 在右侧下载.zip格式的输出,在Overleaf上编译浏览.
102
+ """
103
+
104
+
105
  with gr.Blocks(theme=theme) as demo:
106
  gr.Markdown('''
107
  # Auto-Draft: 文献整理辅助工具
 
112
 
113
  在这个Huggingface Organization里也提供一定额度的免费体验: [AUTO-ACADEMIC](https://huggingface.co/auto-academic).
114
 
115
+ 如果有更多想法和建议欢迎加入QQ群里交流, 如果我在Space里更新了Key我会第一时间通知大家. 群号: ***249738228***.
 
 
 
 
116
  ''')
117
 
118
  with gr.Row():
 
125
 
126
  # 每个功能做一个tab
127
  with gr.Tab("学术论文"):
128
+ gr.Markdown(ACADEMIC_PAPER)
129
+
130
+
131
  title = gr.Textbox(value="Playing Atari with Deep Reinforcement Learning", lines=1, max_lines=1,
132
  label="Title", info="论文标题")
133
 
 
135
  description_pp = gr.Textbox(lines=5, label="Description (Optional)", visible=True,
136
  info="对希望生成的论文的一些描述. 包括这篇论文的创新点, 主要贡献, 等.")
137
 
 
 
 
 
138
  with gr.Row():
139
  with gr.Column():
140
+ with gr.Row():
141
+ template = gr.Dropdown(label="Template", choices=["ICLR2022"], value="ICLR2022",
142
+ interactive=False,
143
+ info="生成论文的参考模板. (暂不支持修改)")
144
+ model_selection = gr.Dropdown(label="Model", choices=["gpt-4", "gpt-3.5-turbo"], value="gpt-4",
145
+ interactive=True,
146
+ info="生成论文用到的语言模型.")
147
  gr.Markdown('''
148
+ 上传.bib文件提供AI需要参考的文献.
 
 
149
  ''')
150
  bibtex_file = gr.File(label="Upload .bib file", file_types=["text"],
151
+ interactive=True)
152
+ gr.Examples(
153
+ examples=["latex_templates/pre_refs.bib"],
154
+ inputs=bibtex_file
155
+ )
156
  with gr.Column():
157
  search_engine = gr.Dropdown(label="Search Engine",
158
  choices=["ArXiv", "Semantic Scholar", "Google Scholar", "None"],
159
  value= "Semantic Scholar",
160
+ interactive=False,
161
+ info="用于决定GPT-4用什么搜索引擎来搜索文献. (暂不支持修改)")
162
+ tldr_checkbox = gr.Checkbox(value=True, label="TLDR;",
163
  info="选择此筐表示将使用Semantic Scholar的TLDR作为文献的总结.",
164
+ interactive = True)
165
+ sections = gr.CheckboxGroup(choices=["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"],
166
+ type="value", label="生成章节", interactive = True,
167
+ value=["introduction", "related works"])
168
+ slider = gr.Slider(minimum=1, maximum=100, value=50, step=1,
169
+ interactive = True, label="最大参考文献数目")
170
 
171
  with gr.Row():
172
  clear_button_pp = gr.Button("Clear")
 
205
  file_output = gr.File(label="Output")
206
 
207
  clear_button_pp.click(fn=clear_inputs, inputs=[title, description_pp], outputs=[title, description_pp])
208
+ # submit_button_pp.click(fn=wrapped_generator, inputs=[title, description_pp, key, template, tldr, slider, sections, bibtex_file], outputs=file_output)
209
+ submit_button_pp.click(fn=wrapped_generator, inputs=[title, description_pp, key, template, tldr_checkbox, slider, sections, bibtex_file, model_selection ], outputs=file_output)
210
 
211
  demo.queue(concurrency_count=1, max_size=5, api_open=False)
212
  demo.launch()
auto_backgrounds.py CHANGED
@@ -30,8 +30,29 @@ def log_usage(usage, generating_target, print_out=True):
30
  print(message)
31
  logging.info(message)
32
 
33
- def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
34
- tldr=False, max_kw_refs=4, max_num_refs=10):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  print("Generation setup...")
36
  paper = {}
37
  paper_body = {}
@@ -44,24 +65,16 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
44
  print("Initialize the paper information ...")
45
  input_dict = {"title": title, "description": description}
46
  # keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
47
- keywords, usage = keywords_generation(input_dict) #todo: handle format error here
48
- print(f"keywords: {keywords}")
49
  log_usage(usage, "keywords")
50
 
51
  # generate keywords dictionary
52
  keywords = {keyword:max_kw_refs for keyword in keywords}
53
- # tmp = {}
54
- # for keyword in json.loads(keywords):
55
- # tmp[keyword] = max_kw_refs
56
- # keywords = tmp
57
- print(f"keywords: {keywords}")
58
 
59
- ref = References()
60
  ref.collect_papers(keywords, tldr=tldr)
61
- # todo: use `all_paper_ids` to check if all citations are in this list
62
- # in tex_processing, remove all duplicated ids
63
- # find most relevant papers; max_num_refs
64
- all_paper_ids = ref.to_bibtex(bibtex_path)
65
 
66
  print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
67
 
@@ -70,11 +83,12 @@ def _generation_setup(title, description="", template="ICLR2022", model="gpt-4",
70
  paper["references"] = ref.to_prompts()
71
  paper["body"] = paper_body
72
  paper["bibtex"] = bibtex_path
73
- return paper, destination_folder, all_paper_ids
74
 
75
 
76
 
77
  def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-4"):
 
78
  paper, destination_folder, _ = _generation_setup(title, description, template, model)
79
 
80
  for section in ["introduction", "related works", "backgrounds"]:
@@ -92,25 +106,15 @@ def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-
92
  return make_archive(destination_folder, filename)
93
 
94
 
95
- def fake_generator(title, description="", template="ICLR2022", model="gpt-4"):
96
- """
97
- This function is used to test the whole pipeline without calling OpenAI API.
98
- """
99
- input_dict = {"title": title, "description": description, "generator": "generate_draft"}
100
- filename = hash_name(input_dict) + ".zip"
101
- return make_archive("sample-output.pdf", filename)
102
-
103
-
104
- def generate_draft(title, description="", template="ICLR2022", model="gpt-4", tldr=True, max_kw_refs=4):
105
- paper, destination_folder, _ = _generation_setup(title, description, template, model, tldr, max_kw_refs)
106
- raise
107
- # todo: `list_of_methods` failed to be generated; find a solution ...
108
- # print("Generating figures ...")
109
- # usage = figures_generation(paper, destination_folder, model="gpt-3.5-turbo")
110
- # log_usage(usage, "figures")
111
 
112
- # for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
113
- for section in ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]:
 
114
  max_attempts = 4
115
  attempts_count = 0
116
  while attempts_count < max_attempts:
@@ -127,6 +131,7 @@ def generate_draft(title, description="", template="ICLR2022", model="gpt-4", tl
127
 
128
  input_dict = {"title": title, "description": description, "generator": "generate_draft"}
129
  filename = hash_name(input_dict) + ".zip"
 
130
  return make_archive(destination_folder, filename)
131
 
132
 
 
30
  print(message)
31
  logging.info(message)
32
 
33
+ def _generation_setup(title, description="", template="ICLR2022", tldr=False,
34
+ max_kw_refs=10, max_num_refs=50, bib_refs=None):
35
+ """
36
+ This function handles the setup process for paper generation; it consists of three steps:
37
+ 1. Copy the template to the outputs folder. Create the log file `generation.log`
38
+ 2. Collect references based on the given `title` and `description`
39
+ 3. Generate the basic `paper` object (a dictionary)
40
+
41
+ Parameters:
42
+ title (str): The title of the paper.
43
+ description (str, optional): A short description or abstract for the paper. Defaults to an empty string.
44
+ template (str, optional): The template to be used for paper generation. Defaults to "ICLR2022".
45
+ tldr (bool, optional): A flag indicating whether the TL;DR (Too Long; Didn't Read) summary returned by Semantic Scholar should be used in place of the full abstract for the collected papers. Defaults to False.
46
+ max_kw_refs (int, optional): The maximum number of references that can be associated with each keyword. Defaults to 10.
47
+ max_num_refs (int, optional): The maximum number of references that can be included in the paper. Defaults to 50.
48
+ bib_refs (optional): A user-supplied .bib file of pre-existing references; it is passed unchanged to `References`, which loads it with `load_papers_from_bibtex`. Defaults to None.
49
+
50
+ Returns:
51
+ tuple: A tuple containing the following elements:
52
+ - paper (dict): A dictionary containing the generated paper information.
53
+ - destination_folder (str): The path to the destination folder where the generation log is saved.
54
+ - all_paper_ids (list): A list of all paper IDs collected for the references.
55
+ """
56
  print("Generation setup...")
57
  paper = {}
58
  paper_body = {}
 
65
  print("Initialize the paper information ...")
66
  input_dict = {"title": title, "description": description}
67
  # keywords, usage = keywords_generation(input_dict, model="gpt-3.5-turbo", max_kw_refs=max_kw_refs)
68
+ keywords, usage = keywords_generation(input_dict)
 
69
  log_usage(usage, "keywords")
70
 
71
  # generate keywords dictionary
72
  keywords = {keyword:max_kw_refs for keyword in keywords}
73
+ print(f"keywords: {keywords}\n\n")
 
 
 
 
74
 
75
+ ref = References(title, bib_refs)
76
  ref.collect_papers(keywords, tldr=tldr)
77
+ all_paper_ids = ref.to_bibtex(bibtex_path, max_num_refs) #todo: max_num_refs has not implemented yet
 
 
 
78
 
79
  print(f"The paper information has been initialized. References are saved to {bibtex_path}.")
80
 
 
83
  paper["references"] = ref.to_prompts()
84
  paper["body"] = paper_body
85
  paper["bibtex"] = bibtex_path
86
+ return paper, destination_folder, all_paper_ids #todo: use `all_paper_ids` to check if all citations are in this list
87
 
88
 
89
 
90
  def generate_backgrounds(title, description="", template="ICLR2022", model="gpt-4"):
91
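+ # todo: update the call below to match the current `_generation_setup` signature (its 4th positional parameter is now `tldr`, so `model` must no longer be passed there)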
92
  paper, destination_folder, _ = _generation_setup(title, description, template, model)
93
 
94
  for section in ["introduction", "related works", "backgrounds"]:
 
106
  return make_archive(destination_folder, filename)
107
 
108
 
109
+ def generate_draft(title, description="", template="ICLR2022",
110
+ model="gpt-4", tldr=True, max_kw_refs=10, max_num_refs=30, sections=None, bib_refs=None):
111
+ # pre-processing `sections` parameter;
112
+ if sections is None:
113
+ sections = ["introduction", "related works", "backgrounds", "methodology", "experiments", "conclusion", "abstract"]
 
 
 
 
 
 
 
 
 
 
 
114
 
115
+ # todo: add more parameters; select which section to generate; select maximum refs.
116
+ paper, destination_folder, _ = _generation_setup(title, description, template, tldr, max_kw_refs, max_num_refs, bib_refs)
117
+ for section in sections:
118
  max_attempts = 4
119
  attempts_count = 0
120
  while attempts_count < max_attempts:
 
131
 
132
  input_dict = {"title": title, "description": description, "generator": "generate_draft"}
133
  filename = hash_name(input_dict) + ".zip"
134
+ print("\nMission completed.\n")
135
  return make_archive(destination_folder, filename)
136
 
137
 
latex_templates/pre_refs.bib CHANGED
@@ -1,17 +1,20 @@
 
 
 
 
 
 
 
 
1
 
2
- @article{1512.07669,
3
- title = {Reinforcement Learning: Stochastic Approximation Algorithms for Markov
4
- Decision Processes},
5
- author = {Vikram Krishnamurthy},
6
- journal={arXiv preprint arXiv:1512.07669},
7
- year = {2015},
8
- url = {http://arxiv.org/abs/1512.07669v1}
9
- }
10
-
11
- @article{1511.02377,
12
- title = {The Value Functions of Markov Decision Processes},
13
- author = {Ehud Lehrer , Eilon Solan , Omri N. Solan},
14
- journal={arXiv preprint arXiv:1511.02377},
15
- year = {2015},
16
- url = {http://arxiv.org/abs/1511.02377v1}
17
- }
 
1
+ @inproceedings{ma2020understanding,
2
+ title={Understanding the impact of model incoherence on convergence of incremental sgd with random reshuffle},
3
+ author={Ma, Shaocong and Zhou, Yi},
4
+ booktitle={International Conference on Machine Learning},
5
+ pages={6565--6574},
6
+ year={2020},
7
+ organization={PMLR}
8
+ }
9
 
10
+ @inproceedings{ma2020variance,
11
+ author = {Ma, Shaocong and Zhou, Yi and Zou, Shaofeng},
12
+ booktitle = {Advances in Neural Information Processing Systems},
13
+ editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
14
+ pages = {14796--14806},
15
+ publisher = {Curran Associates, Inc.},
16
+ title = {Variance-Reduced Off-Policy TDC Learning: Non-Asymptotic Convergence Analysis},
17
+ url = {https://proceedings.neurips.cc/paper_files/paper/2020/file/a992995ef4f0439b258f2360dbb85511-Paper.pdf},
18
+ volume = {33},
19
+ year = {2020}
20
+ }
 
 
 
 
 
utils/prompts.py CHANGED
@@ -33,16 +33,15 @@ def generate_experiments_prompts(paper_info):
33
  ######################################################################################################################
34
 
35
  # two parameters: min_refs_num, max_refs_num
36
- keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.
37
- Instructions
38
- - Your response should always be a Python list; e.g. ["keyword1", "keyword2", "keyword3"]
39
- - The length of list should between {min_refs_num} and {max_refs_num}
40
- - Use specific phrases as keywords and avoid using too general words (e.g. machine learning)"""
41
- # keywords_system_template = """You are an assistant designed to provide related research fields of academic papers.
42
- # Instructions:
43
- # - Your response should follow the following output format: ["field1", "field2", "field3"]\n
44
- # - The length of this Python list should between {min_refs_num} and {max_refs_num}\n
45
- # - Use specific phrases instead of using too general words (e.g. machine learning)"""
46
 
47
  # two parameters: min_refs_num, max_refs_num
48
  exp_methods_system_template = """You are an assistant designed to provide most related algorithms or methods to a given paper title.
 
33
  ######################################################################################################################
34
 
35
  # two parameters: min_refs_num, max_refs_num
36
+ # keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.
37
+ # Instructions
38
+ # - Your response should always be a Python list; e.g. ["keyword1", "keyword2", "keyword3"]
39
+ # - The length of list should between {min_refs_num} and {max_refs_num}
40
+ # - Use specific phrases as keywords and avoid using too general words (e.g. machine learning)"""
41
+ keywords_system_template = """You are an assistant designed to provide accurate and informative keywords of searching academic papers.\n
42
+ Instructions:\n
43
+ - Your response should follow the following output format: ["field1", "field2", "field3", "field4"]\n
44
+ - The length of this Python list should between {min_refs_num} and {max_refs_num}."""
 
45
 
46
  # two parameters: min_refs_num, max_refs_num
47
  exp_methods_system_template = """You are an assistant designed to provide most related algorithms or methods to a given paper title.
utils/references.py CHANGED
@@ -150,7 +150,6 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
150
  # turn the search result to a list of paper dictionary.
151
  papers_ss = []
152
  for raw_paper in search_results_ss:
153
- print(raw_paper['title'])
154
  if raw_paper["abstract"] is None:
155
  continue
156
 
@@ -170,6 +169,8 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
170
  abstract = raw_paper['tldr']['text']
171
  else:
172
  abstract = remove_newlines(raw_paper['abstract'])
 
 
173
  embeddings_dict = raw_paper.get('embedding')
174
  if embeddings_dict is None:
175
  continue
@@ -203,14 +204,13 @@ def _collect_papers_ss(keyword, counts=3, tldr=False):
203
  ######################################################################################################################
204
 
205
  class References:
206
- def __init__(self):
207
- # if load_papers:
208
- # # todo: (1) too large bibtex may make have issues on token limitations; may truncate to 5 or 10
209
- # # (2) google scholar didn't give a full abstract for some papers ...
210
- # # (3) may use langchain to support long input
211
- # self.papers = load_papers_from_bibtex(load_papers)
212
- # else:
213
- self.papers = {}
214
 
215
  def load_papers(self, bibtex, keyword):
216
  self.papers[keyword] = load_papers_from_bibtex(bibtex)
@@ -230,14 +230,14 @@ class References:
230
  for key, counts in keywords_dict.items():
231
  self.papers[key] = _collect_papers_ss(key, counts, tldr)
232
 
233
- def find_relevant(self, max_refs=30):
234
- # todo: use embeddings to evaluate
235
- pass
236
 
237
- def to_bibtex(self, path_to_bibtex="ref.bib"):
238
  """
239
  Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
240
  """
 
 
 
241
  papers = self._get_papers(keyword = "_all")
242
 
243
  # clear the bibtex file
 
150
  # turn the search result to a list of paper dictionary.
151
  papers_ss = []
152
  for raw_paper in search_results_ss:
 
153
  if raw_paper["abstract"] is None:
154
  continue
155
 
 
169
  abstract = raw_paper['tldr']['text']
170
  else:
171
  abstract = remove_newlines(raw_paper['abstract'])
172
+
173
+ # some papers have no embeddings; handle this case
174
  embeddings_dict = raw_paper.get('embedding')
175
  if embeddings_dict is None:
176
  continue
 
204
  ######################################################################################################################
205
 
206
  class References:
207
+ def __init__(self, title, load_papers):
208
+ if load_papers is not None:
209
+ self.papers = {}
210
+ self.papers["customized_refs"] = load_papers_from_bibtex(load_papers)
211
+ else:
212
+ self.papers = {}
213
+ self.title = title
 
214
 
215
  def load_papers(self, bibtex, keyword):
216
  self.papers[keyword] = load_papers_from_bibtex(bibtex)
 
230
  for key, counts in keywords_dict.items():
231
  self.papers[key] = _collect_papers_ss(key, counts, tldr)
232
 
 
 
 
233
 
234
+ def to_bibtex(self, path_to_bibtex="ref.bib", max_num_refs=50):
235
  """
236
  Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`.
237
  """
238
+ # todo:
239
+ # use embeddings to evaluate; keep top k relevant references in papers
240
+ # send (title, .bib file) to evaluate embeddings; receive truncated papers
241
  papers = self._get_papers(keyword = "_all")
242
 
243
  # clear the bibtex file