# arxiv-cards / arxiv_util.py
# eliolio's picture
# Duplicate from EuroSciPy2022/arxiv-cards
# 5ba4f18
import pdb
import urllib
import urllib.error
import urllib.request
from collections import namedtuple # later use py3.7 dataclasses

import feedparser
# Immutable record for one arXiv paper. get_paper_info() fills it positionally,
# so the order of the field names below is part of the contract.
ArxivPaper = namedtuple(
    "ArxivPaper",
    (
        "title",
        "authors",
        "abstract",
        "linktopdf",
        "linktoabs",
        "arxiv_id",
    ),
)
def arxiv_url_sanitizer(url):
    """
    Turn an arXiv PDF link into the matching abstract-page link.

    Every "/pdf" in the url becomes "/abs" and every ".pdf" is dropped.
    A url that does not contain "pdf" at all is handed back untouched.
    """
    # nothing to rewrite unless the url mentions pdf somewhere
    if "pdf" not in url:
        return url
    return url.replace("/pdf", "/abs").replace(".pdf", "")
class _ArxivEntryNotFound(RuntimeError, IndexError):
    """
    The arXiv API answered but returned no entry for the requested id.

    It is a RuntimeError so it can be handled like the fetch failure raised
    by get_paper_info, and an IndexError because that is what indexing the
    empty entry list used to raise.
    """


def _arxiv_id_from_url(url):
    """
    Return the arXiv identifier found in an abs/pdf url (or a bare id).
    """
    cleaned = url.strip().rstrip("/")
    if cleaned.endswith(".pdf"):
        cleaned = cleaned[:-len(".pdf")]
    # Old-style identifiers such as "hep-th/9901001" contain a slash, so take
    # everything after the /abs/ or /pdf/ path marker when there is one.
    for marker in ("/abs/", "/pdf/"):
        _, found, tail = cleaned.partition(marker)
        if found and tail:
            return tail
    # No marker: fall back to the last path segment.
    return cleaned.split("/")[-1]


def _select_links(links):
    """
    Pick (linktopdf, linktoabs) out of the link dicts of one feed entry.
    Either value stays None when no matching link is present.
    """
    linktopdf = None
    linktoabs = None
    abs_is_alternate = False
    for link_dict in links:
        # A link without a type must not abort the whole lookup.
        link_type = link_dict.get("type") or ""
        if "html" in link_type:
            # An entry may carry other html-typed links besides the abstract
            # page (presumably e.g. a rel="related" DOI link that the feed
            # parser reports as html -- TODO confirm). Once a rel="alternate"
            # link is found, only another alternate link may replace it.
            is_alternate = link_dict.get("rel") == "alternate"
            if is_alternate or not abs_is_alternate:
                linktoabs = link_dict["href"]
                abs_is_alternate = is_alternate
        elif "pdf" in link_type:
            linktopdf = link_dict["href"]
    return linktopdf, linktoabs


def get_paper_info(url):
    """
    Given an arxiv url (abs or pdf link, or a bare arxiv id) queries the
    arXiv export API and returns a ArxivPaper object with fields
    title : str
    authors : list of author entries as parsed from the feed, cut down to
              the first five followed by {'name': 'and others...'} when
              there are more than five
    abstract : str
    linktopdf : str or None when the feed has no pdf link
    linktoabs : str or None when the feed has no html link
    arxiv_id : str

    Raises RuntimeError when the API request fails with an HTTP error or
    when the API returns no entry for the id (the latter error is also an
    IndexError).
    """
    arxiv_id = _arxiv_id_from_url(url)
    arxiv_searchurl = "http://export.arxiv.org/api/query?id_list={}".format(arxiv_id)

    try:
        atom_feed = urllib.request.urlopen(arxiv_searchurl)
    except urllib.error.HTTPError as e:
        # keep the HTTP error attached as the cause for debugging
        raise RuntimeError("Trouble fetching ArXiv Id : {}".format(arxiv_id)) from e

    # Close the HTTP response as soon as the feed has been parsed.
    with atom_feed:
        parsed_feed = feedparser.parse(atom_feed)

    entries = parsed_feed["entries"]
    if not entries:
        raise _ArxivEntryNotFound("No ArXiv entry found for Id : {}".format(arxiv_id))
    paper = entries[0]

    title = paper["title"]
    authors = paper["authors"]
    if len(authors) > 5:
        # show at most five names; the slice leaves the parsed list untouched
        authors = authors[:5] + [{'name': 'and others...'}]
    abstract = paper["summary"]
    linktopdf, linktoabs = _select_links(paper["links"])

    # paper["arxiv_comment"] is not there in all arxiv pages, so it is not read.
    return ArxivPaper(title, authors, abstract, linktopdf, linktoabs, arxiv_id)