Spaces:
Running
Running
# Each `paper` is a dictionary containing: | |
# (1) paper_id (2) title (3) authors (4) year (5) link (6) abstract (7) journal | |
# | |
# Generate references: | |
# `Reference` class: | |
# 1. Read a given .bib file to collect papers; use `search_paper_abstract` method to fill missing abstract. | |
# 2. Given some keywords; use ArXiv or Semantic Scholar API to find papers. | |
# 3. Generate bibtex from the selected papers. --> to_bibtex() | |
# 4. Generate prompts from the selected papers: --> to_prompts() | |
# A sample prompt: {"paper_id": "paper summary"} | |
import requests | |
import re | |
import bibtexparser | |
from scholarly import scholarly | |
from scholarly import ProxyGenerator | |
###################################################################################################################### | |
# Some basic tools | |
###################################################################################################################### | |
def remove_newlines(serie): | |
# This function is applied to the abstract of each paper to reduce the length of prompts. | |
serie = serie.replace('\n', ' ') | |
serie = serie.replace('\\n', ' ') | |
serie = serie.replace(' ', ' ') | |
serie = serie.replace(' ', ' ') | |
return serie | |
def search_paper_abstract(title): | |
pg = ProxyGenerator() | |
success = pg.ScraperAPI("921b16f94d701308b9d9b4456ddde155") | |
scholarly.use_proxy(pg) | |
# input the title of a paper, return its abstract | |
search_query = scholarly.search_pubs(title) | |
paper = next(search_query) | |
return remove_newlines(paper['bib']['abstract']) | |
def load_papers_from_bibtex(bib_file_path): | |
with open(bib_file_path) as bibtex_file: | |
bib_database = bibtexparser.load(bibtex_file) | |
if len(bib_database.entries) == 0: | |
return [] | |
else: | |
bib_papers = [] | |
for bibitem in bib_database.entries: | |
paper_id = bibitem.get("ID") | |
title = bibitem.get("title") | |
if title is None: | |
continue | |
journal = bibitem.get("journal") | |
year = bibitem.get("year") | |
author = bibitem.get("author") | |
abstract = bibitem.get("abstract") | |
if abstract is None: | |
abstract = search_paper_abstract(title) | |
result = { | |
"paper_id": paper_id, | |
"title": title, | |
"link": "", | |
"abstract": abstract, | |
"authors": author, | |
"year": year, | |
"journal": journal | |
} | |
bib_papers.append(result) | |
return bib_papers | |
###################################################################################################################### | |
# Semantic Scholar (SS) API | |
###################################################################################################################### | |
def ss_search(keywords, limit=20, fields=None): | |
# space between the query to be removed and replaced with + | |
if fields is None: | |
fields = ["title", "abstract", "venue", "year", "authors", "tldr", "embedding", "externalIds"] | |
keywords = keywords.lower() | |
keywords = keywords.replace(" ", "+") | |
url = f'https://api.semanticscholar.org/graph/v1/paper/search?query={keywords}&limit={limit}&fields={",".join(fields)}' | |
# headers = {"Accept": "*/*", "x-api-key": constants.S2_KEY} | |
headers = {"Accept": "*/*"} | |
response = requests.get(url, headers=headers, timeout=30) | |
return response.json() | |
def _collect_papers_ss(keyword, counts=3, tldr=False): | |
def externalIds2link(externalIds): | |
# Sample externalIds: | |
# "{'MAG': '2932819148', 'DBLP': 'conf/icml/HaarnojaZAL18', 'ArXiv': '1801.01290', 'CorpusId': 28202810}" | |
if externalIds: | |
# Supports ArXiv, MAG, ACL, PubMed, Medline, PubMedCentral, DBLP, DOI | |
# priority: DBLP > arXiv > (todo: MAG > CorpusId > DOI > ACL > PubMed > Mdeline > PubMedCentral) | |
# DBLP | |
dblp_id = externalIds.get('DBLP') | |
if dblp_id is not None: | |
dblp_link = f"dblp.org/rec/{dblp_id}" | |
return dblp_link | |
# arXiv | |
arxiv_id = externalIds.get('ArXiv') | |
if arxiv_id is not None: | |
arxiv_link = f"arxiv.org/abs/{arxiv_id}" | |
return arxiv_link | |
return "" | |
else: | |
# if this is an empty dictionary, return an empty string | |
return "" | |
def extract_paper_id(last_name, year_str, title): | |
pattern = r'^\w+' | |
words = re.findall(pattern, title) | |
# return last_name + year_str + title.split(' ', 1)[0] | |
try: | |
output = last_name + year_str + words[0] | |
except IndexError: | |
output = last_name + year_str + title[:4] | |
return output | |
def extract_author_info(raw_authors): | |
authors = [author['name'] for author in raw_authors] | |
authors_str = " and ".join(authors) | |
try: | |
last_name = authors[0].split()[-1] | |
except IndexError: | |
last_name = "ma" | |
# pattern = r'^\w+' | |
# last_name = re.findall(pattern, authors[0]) | |
return authors_str, last_name | |
def parse_search_results(search_results_ss): | |
# turn the search result to a list of paper dictionary. | |
papers_ss = [] | |
for raw_paper in search_results_ss: | |
if raw_paper["abstract"] is None: | |
continue | |
authors_str, last_name = extract_author_info(raw_paper['authors']) | |
year_str = str(raw_paper['year']) | |
title = raw_paper['title'] | |
# some journal may contain &; replace it. e.g. journal={IEEE Power & Energy Society General Meeting} | |
journal = raw_paper['venue'].replace("&", "\\&") | |
if not journal: | |
journal = "arXiv preprint" | |
paper_id = extract_paper_id(last_name, year_str, title).lower() | |
link = externalIds2link(raw_paper['externalIds']) | |
if tldr and raw_paper['tldr'] is not None: | |
abstract = raw_paper['tldr']['text'] | |
else: | |
abstract = remove_newlines(raw_paper['abstract']) | |
result = { | |
"paper_id": paper_id, | |
"title": title, | |
"abstract": abstract, | |
"link": link, | |
"authors": authors_str, | |
"year": year_str, | |
"journal": journal | |
} | |
papers_ss.append(result) | |
return papers_ss | |
raw_results = ss_search(keyword, limit=counts) | |
if raw_results is not None: | |
search_results = raw_results['data'] | |
else: | |
search_results = [] | |
results = parse_search_results(search_results) | |
return results | |
###################################################################################################################### | |
# ArXiv API | |
###################################################################################################################### | |
def _collect_papers_arxiv(keyword, counts=3, tldr=False): | |
# Build the arXiv API query URL with the given keyword and other parameters | |
def build_query_url(keyword, results_limit=3, sort_by="relevance", sort_order="descending"): | |
base_url = "http://export.arxiv.org/api/query?" | |
query = f"search_query=all:{keyword}&start=0&max_results={results_limit}" | |
query += f"&sortBy={sort_by}&sortOrder={sort_order}" | |
return base_url + query | |
# Fetch search results from the arXiv API using the constructed URL | |
def fetch_search_results(query_url): | |
response = requests.get(query_url) | |
return response.text | |
# Parse the XML content of the API response to extract paper information | |
def parse_results(content): | |
from xml.etree import ElementTree as ET | |
root = ET.fromstring(content) | |
namespace = "{http://www.w3.org/2005/Atom}" | |
entries = root.findall(f"{namespace}entry") | |
results = [] | |
for entry in entries: | |
title = entry.find(f"{namespace}title").text | |
link = entry.find(f"{namespace}id").text | |
summary = entry.find(f"{namespace}summary").text | |
summary = remove_newlines(summary) | |
# Extract the authors | |
authors = entry.findall(f"{namespace}author") | |
author_list = [] | |
for author in authors: | |
name = author.find(f"{namespace}name").text | |
author_list.append(name) | |
authors_str = " and ".join(author_list) | |
# Extract the year | |
published = entry.find(f"{namespace}published").text | |
year = published.split("-")[0] | |
founds = re.search(r'\d+\.\d+', link) | |
if founds is None: | |
# some links are not standard; such as "https://arxiv.org/abs/cs/0603127v1". | |
# will be solved in the future. | |
continue | |
else: | |
arxiv_id = founds.group(0) | |
journal = f"arXiv preprint arXiv:{arxiv_id}" | |
result = { | |
"paper_id": arxiv_id, | |
"title": title, | |
"link": link, | |
"abstract": summary, | |
"authors": authors_str, | |
"year": year, | |
"journal": journal | |
} | |
results.append(result) | |
return results | |
query_url = build_query_url(keyword, counts) | |
content = fetch_search_results(query_url) | |
results = parse_results(content) | |
return results | |
###################################################################################################################### | |
# References Class | |
###################################################################################################################### | |
class References: | |
def __init__(self, load_papers=""): | |
if load_papers: | |
# todo: (1) too large bibtex may make have issues on token limitations; may truncate to 5 or 10 | |
# (2) google scholar didn't give a full abstract for some papers ... | |
# (3) may use langchain to support long input | |
self.papers = load_papers_from_bibtex(load_papers) | |
else: | |
self.papers = [] | |
def collect_papers(self, keywords_dict, method="arxiv", tldr=False): | |
""" | |
keywords_dict: | |
{"machine learning": 5, "language model": 2}; | |
the first is the keyword, the second is how many references are needed. | |
""" | |
match method: | |
case "arxiv": | |
process = _collect_papers_arxiv | |
case "ss": | |
process = _collect_papers_ss | |
case _: | |
raise NotImplementedError("Other sources have not been not supported yet.") | |
for key, counts in keywords_dict.items(): | |
self.papers = self.papers + process(key, counts, tldr) | |
seen = set() | |
papers = [] | |
for paper in self.papers: | |
paper_id = paper["paper_id"] | |
if paper_id not in seen: | |
seen.add(paper_id) | |
papers.append(paper) | |
self.papers = papers | |
def to_bibtex(self, path_to_bibtex="ref.bib"): | |
""" | |
Turn the saved paper list into bibtex file "ref.bib". Return a list of all `paper_id`. | |
""" | |
papers = self.papers | |
# clear the bibtex file | |
with open(path_to_bibtex, "w", encoding="utf-8") as file: | |
file.write("") | |
bibtex_entries = [] | |
paper_ids = [] | |
for paper in papers: | |
bibtex_entry = f"""@article{{{paper["paper_id"]}, | |
title = {{{paper["title"]}}}, | |
author = {{{paper["authors"]}}}, | |
journal={{{paper["journal"]}}}, | |
year = {{{paper["year"]}}}, | |
url = {{{paper["link"]}}} | |
}}""" | |
bibtex_entries.append(bibtex_entry) | |
paper_ids.append(paper["paper_id"]) | |
# Save the generated BibTeX entries to a file | |
with open(path_to_bibtex, "a", encoding="utf-8") as file: | |
file.write(bibtex_entry) | |
file.write("\n\n") | |
return paper_ids | |
def to_prompts(self): | |
# `prompts`: | |
# {"paper1_bibtex_id": "paper_1_abstract", "paper2_bibtex_id": "paper2_abstract"} | |
# this will be used to instruct GPT model to cite the correct bibtex entry. | |
prompts = {} | |
for paper in self.papers: | |
prompts[paper["paper_id"]] = paper["abstract"] | |
return prompts | |
if __name__ == "__main__": | |
# refs = References() | |
# keywords_dict = { | |
# "Deep Q-Networks": 15, | |
# "Policy Gradient Methods": 24, | |
# "Actor-Critic Algorithms": 4, | |
# "Model-Based Reinforcement Learning": 13, | |
# "Exploration-Exploitation Trade-off": 7 | |
# } | |
# refs.collect_papers(keywords_dict, method="ss", tldr=True) | |
# for p in refs.papers: | |
# print(p["paper_id"]) | |
# print(len(refs.papers)) | |
bib = "D:\\Projects\\auto-draft\\latex_templates\\pre_refs.bib" | |
papers = load_papers_from_bibtex(bib) | |
for paper in papers: | |
print(paper) | |