yiyixin committed on
Commit
6be34e2
·
1 Parent(s): 40d70d9
Files changed (8)
  1. CONTRIBUTION.md +11 -0
  2. Procfile +1 -0
  3. README copy.md +19 -0
  4. backend.py +86 -0
  5. framework.png +0 -0
  6. main.py +89 -0
  7. requirements.txt +7 -0
  8. setup.sh +13 -0
CONTRIBUTION.md ADDED
@@ -0,0 +1,11 @@
+ ## Arxiv2latex Contribution Guide
+ Thank you for your interest in contributing to Arxiv2latex.
+
+ Here is a guide on how to contribute.
+
+ ## How to make a PR
+ Pull requests are welcome.
+ 1. Fork this repo.
+ 2. Make your changes and push them to your fork.
+ 3. Send a pull request from your develop branch to this repo's develop branch.
+ This is the only way to submit a pull request to this repo. Thank you!
Procfile ADDED
@@ -0,0 +1 @@
+ web: sh setup.sh && streamlit run main.py
README copy.md ADDED
@@ -0,0 +1,19 @@
+ [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://arxiv2latex.herokuapp.com/)
+
+ ![arXiv2Latex logo - an arrow pointing from "arXiv" to "Latex"](/framework.png)
+
+ ## NEWS
+
+ - Please deploy this app locally, since the online service will end on April 1, 2023, due to insufficient funding.
+
+ ## About arXiv2Latex
+
+ - Download the source LaTeX code of multiple arXiv papers with one click.
+
+ ## Motivation
+
+ To facilitate paper writing, we often need to download and copy the LaTeX source code of other people's papers. However, this process becomes very tedious when there are many papers to download, so I developed this tool to automate it. It is implemented simply with `requests` and `tarfile`. To make it a free online service, I deployed the project with `Streamlit` and `Heroku`. Here is the demo app link: https://arxiv2latex.herokuapp.com/.
+
+ ## Contribution
+
+ Please read the [Contribution Guide](CONTRIBUTION.md). Also, feel free to contact me for discussion ([email protected]).
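The implementation idea the README describes is small enough to sketch in a few lines: arXiv serves each paper's LaTeX source as a (usually gzipped) tarball at `https://arxiv.org/e-print/<id>`, which `requests` can fetch and `tarfile` can unpack. A minimal sketch, with an illustrative paper ID and output directory (note that a few e-prints are a single gzipped TeX file rather than a tarball, which this sketch does not handle):

```python
import io
import tarfile

import requests

# Illustrative arXiv ID; any ID from an https://arxiv.org/abs/<id> link works the same way.
paper_id = "1706.03762"

# arXiv serves the LaTeX source as a (usually gzipped) tarball at /e-print/<id>.
response = requests.get(f"https://arxiv.org/e-print/{paper_id}")
response.raise_for_status()

# tarfile auto-detects the compression when opened in the default "r" mode.
with tarfile.open(fileobj=io.BytesIO(response.content)) as tar:
    tar.extractall(path=f"./sources/{paper_id}")
```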
backend.py ADDED
@@ -0,0 +1,86 @@
+ import tarfile
+ import os
+ import requests
+ import datetime
+ import pandas as pd
+ import shutil
+ import base64
+ from bs4 import BeautifulSoup
+ from tqdm import tqdm
+
+ def ToBase64(file):
+     """Read a file and return its base64-encoded bytes."""
+     with open(file, 'rb') as fileObj:
+         data = fileObj.read()
+     base64_data = base64.b64encode(data)
+     return base64_data
+
+ def archive_dir(dir_name, output_filename, format="zip"):
+     """Archive a directory and return the path of the resulting archive."""
+     return shutil.make_archive(output_filename, format, dir_name)
+
+ def make_dir_if_not_exist(folder):
+     if not os.path.exists(folder):
+         os.makedirs(folder)
+
+ def untar(fname, dirs):
+     """
+     Extract a tar.gz file.
+     :param fname: path of the compressed file
+     :param dirs: directory to extract into
+     :return: bool
+     """
+     try:
+         with tarfile.open(fname) as t:
+             t.extractall(path=dirs)
+         return True
+     except Exception as e:
+         print(e)
+         return False
+
+ def get_timestamp():
+     ts = pd.to_datetime(str(datetime.datetime.now()))
+     d = ts.strftime('%Y%m%d%H%M%S')
+     return d
+
+ def get_name_from_arvix(url):
+     # Scrape the paper title from the arXiv abstract page.
+     res = BeautifulSoup(requests.get(url).content, 'lxml').find("h1", attrs={"class": "title mathjax"})
+     if res is None:
+         return ''
+     # Drop the leading "Title:" prefix and make the title filesystem-friendly.
+     title = res.text[6:].replace(" ", "-")
+     return title
+
+ def download_source(pdf_lists=None, output_base=None, project_name=None, fetch_title=True, return_source=False):
+     base = output_base
+     project_name = project_name + get_timestamp()
+     base = os.path.join(base, project_name)
+     make_dir_if_not_exist(base)
+
+     for pdf_link in tqdm(pdf_lists):
+         file_stamp = pdf_link.split("/")[-1]
+         if fetch_title:
+             title = get_name_from_arvix(pdf_link)
+             if len(title) == 0:
+                 continue
+         else:
+             title = file_stamp
+         source_link = "https://arxiv.org/e-print/" + file_stamp
+         inp = os.path.join(base, 'input')
+         make_dir_if_not_exist(inp)
+         out = os.path.join(base, 'output')
+         make_dir_if_not_exist(out)
+         if return_source:
+             # Only print the source links instead of downloading them.
+             print(source_link)
+             continue
+         response = requests.get(source_link)
+         filename = file_stamp + ".tar.gz"
+         filepath = os.path.join(inp, filename)
+         with open(filepath, "wb") as f:
+             f.write(response.content)
+         outpath = os.path.join(out, title)
+         untar(filepath, outpath)
+     archive_dir(out, os.path.join(base, project_name))
+
+ if __name__ == '__main__':
+     s = get_timestamp()
+     print(s)
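For reference, `backend.py` can also be driven directly from a script or REPL, without the Streamlit frontend. A minimal usage sketch, where the links and the `./download` output directory and `"demo"` project name are illustrative:

```python
from backend import download_source

# Illustrative abs links; the trailing path segment is used as the arXiv ID.
papers = [
    "https://arxiv.org/abs/1512.03385",
    "https://arxiv.org/abs/1706.03762",
]

# Downloads each e-print into ./download/demo<timestamp>/input,
# extracts it under .../output/<title>, then zips the output folder.
download_source(pdf_lists=papers, output_base="./download", project_name="demo")
```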
framework.png ADDED
main.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import requests
+ import streamlit as st
+ from tqdm import tqdm
+ from backend import ToBase64, archive_dir, get_name_from_arvix, get_timestamp, make_dir_if_not_exist, untar
+
+ st.set_page_config(page_title="arXiv2Latex Downloader", page_icon=":page_with_curl:", layout="wide", initial_sidebar_state="expanded", menu_items={
+     "About": "Download the source LaTeX code of multiple arXiv papers with one click"
+ })
+
+ # title
+ st.title("arXiv2Latex Downloader")
+
+ # input arxiv links to download
+ pdf_links_input = st.text_area("Please input the paper links you want to download, following the format below (currently supports up to 10 links).", "")
+ st.markdown("""
+ Input example:
+ ```Plain Text
+ https://arxiv.org/abs/1512.03385
+ https://arxiv.org/abs/1706.03762
+ https://arxiv.org/abs/2009.09724
+ ```
+ """)
+ ## one-click download
+ crawling_or_not = st.button("Crawl the LaTeX Code")
+ if crawling_or_not:
+     print("Crawling...")
+     pdf_lists = pdf_links_input.split("\n")
+     print(pdf_lists)
+     # clean the pdf lists
+     pdf_lists = [i.strip() for i in pdf_lists if len(i.strip()) > 0]
+     # TODO: limit the number of papers to 10, since I am not sure whether base64 supports large file downloads
+     try:
+         if len(pdf_lists) > 10:
+             st.warning("Currently only supports up to 10 papers. Please input at most 10 links.")
+         else:
+             # parsing
+             base = './download/'
+             project_name = get_timestamp()
+             base = os.path.join(base, project_name)
+             make_dir_if_not_exist(base)
+
+             with st.spinner("Downloading papers..."):
+                 # progress bar
+                 bar = st.progress(0)
+                 download_status = st.empty()
+                 N = len(pdf_lists)
+                 for i, pdf_link in tqdm(enumerate(pdf_lists)):
+                     title = get_name_from_arvix(pdf_link)
+                     file_stamp = pdf_link.split("/")[-1]
+                     if len(title) == 0:
+                         # fall back to the arXiv ID if the title cannot be fetched
+                         title = file_stamp
+                     source_link = "https://arxiv.org/e-print/" + file_stamp
+                     inp = os.path.join(base, 'input')
+                     make_dir_if_not_exist(inp)
+                     out = os.path.join(base, 'output')
+                     make_dir_if_not_exist(out)
+                     response = requests.get(source_link)
+                     filename = file_stamp + ".tar.gz"
+                     filepath = os.path.join(inp, filename)
+                     with open(filepath, "wb") as f:
+                         f.write(response.content)
+                     outpath = os.path.join(out, title)
+                     untar(filepath, outpath)
+
+                     # finish one paper
+                     bar.progress((i + 1) / N)
+                     download_status.text(f"Iteration [{i+1}/{N}]: Finished downloading " + title)
+
+             with st.spinner("Archiving as Zip Files..."):
+                 # save it as a zip file
+                 filepath = archive_dir(out, os.path.join(base, project_name))
+
+             # download link
+             b64 = ToBase64(filepath).decode()
+             href = f"<a href='data:application/zip;base64,{b64}' download='arxiv2latex-output-{get_timestamp()}.zip'>Click here to Download the Output Latex Zip Files</a>"
+             st.markdown(href, unsafe_allow_html=True)
+
+             # status
+             st.success("Finished")
+     except Exception as e:
+         st.error("Something went wrong. Please check the input or contact me to fix this bug. Error message:\n" + str(e))
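A note on the download link: the base64 data-URI anchor in `main.py` is a workaround from before Streamlit shipped a native download widget. On a recent Streamlit version, `st.download_button` does the same job without the concern about large base64 payloads; a minimal sketch under that assumption, where `filepath` stands for the zip produced by `archive_dir` above:

```python
import streamlit as st

# Assumes `filepath` points at the zip archive produced by archive_dir.
with open(filepath, "rb") as f:
    st.download_button(
        label="Download the Output LaTeX Zip File",
        data=f,
        file_name="arxiv2latex-output.zip",
        mime="application/zip",
    )
```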
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit
+ requests
+ pandas
+ tqdm
+ bs4
+ lxml
setup.sh ADDED
@@ -0,0 +1,13 @@
+ mkdir -p ~/.streamlit/
+
+ echo "\
+ [general]\n\
+ email = \"[email protected]\"\n\
+ " > ~/.streamlit/credentials.toml
+
+ echo "\
+ [server]\n\
+ headless = true\n\
+ enableCORS=false\n\
+ port = $PORT\n\
+ " > ~/.streamlit/config.toml