yiyixin committed on
Commit
6be34e2
·
1 Parent(s): 40d70d9
Files changed (8)
  1. CONTRIBUTION.md +11 -0
  2. Procfile +1 -0
  3. README copy.md +19 -0
  4. backend.py +86 -0
  5. framework.png +0 -0
  6. main.py +89 -0
  7. requirements.txt +7 -0
  8. setup.sh +13 -0
CONTRIBUTION.md ADDED
@@ -0,0 +1,11 @@
+ ## Arxiv2latex Contribution Guide
+ Thank you for your interest in contributing to Arxiv2latex.
+
+ Here is a guide on how to contribute.
+
+ ## How to make a PR
+ Pull requests are welcome.
+ 1. Fork this repo.
+ 2. Make your changes and push them to your fork.
+ 3. Send a pull request from your develop branch to this repo's develop branch.
+ This is the only way to submit a pull request to this repo. Thank you!
Procfile ADDED
@@ -0,0 +1 @@
+ web: sh setup.sh && streamlit run main.py
README copy.md ADDED
@@ -0,0 +1,19 @@
+ [![Streamlit App](https://static.streamlit.io/badges/streamlit_badge_black_white.svg)](https://arxiv2latex.herokuapp.com/)
+
+ ![arXiv2Latex logo - an arrow pointing from "arXiv" to "Latex"](/framework.png)
+
+ ## NEWS
+
+ - Please deploy this app locally, since the online service will end on April 1, 2023, due to insufficient funding.
+
+ ## About arXiv2Latex
+
+ - Download the source LaTeX code of multiple arXiv papers with one click.
+
+ ## Motivation
+
+ To facilitate paper writing, we often need to download and copy the LaTeX source code of other people's papers. However, this process becomes very tedious when there are many papers to download, so I developed this tool to automate it. It is implemented simply with `requests` and `tarfile`. To make it a free online service, I deployed the project with `Streamlit` and `Heroku`. Here is the demo app link: https://arxiv2latex.herokuapp.com/.
+
+ ## Contribution
+
+ Please read the [Contribution Guide](CONTRIBUTION.md). Also, feel free to contact me for discussion ([email protected]).
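The implementation idea the README describes is small enough to sketch in a few lines: arXiv serves each paper's LaTeX source as a (usually gzipped) tarball at `https://arxiv.org/e-print/<id>`, which `requests` can fetch and `tarfile` can unpack. A minimal sketch, with an illustrative paper ID and output directory (note that a few e-prints are a single gzipped TeX file rather than a tarball, which this sketch does not handle):

```python
import io
import tarfile

import requests

# Illustrative arXiv ID; any ID from an https://arxiv.org/abs/<id> link works the same way.
paper_id = "1706.03762"

# arXiv serves the LaTeX source as a (usually gzipped) tarball at /e-print/<id>.
response = requests.get(f"https://arxiv.org/e-print/{paper_id}")
response.raise_for_status()

# tarfile auto-detects the compression when opened in the default "r" mode.
with tarfile.open(fileobj=io.BytesIO(response.content)) as tar:
    tar.extractall(path=f"./sources/{paper_id}")
```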
backend.py ADDED
@@ -0,0 +1,86 @@
+ import tarfile
+ import os
+ import requests
+ import datetime
+ import pandas as pd
+ import shutil
+ import base64
+ from bs4 import BeautifulSoup
+ from tqdm import tqdm
+
+ def ToBase64(file):
+     """Read a file and return its base64-encoded bytes."""
+     with open(file, 'rb') as fileObj:
+         data = fileObj.read()
+     base64_data = base64.b64encode(data)
+     return base64_data
+
+ def archive_dir(dir_name, output_filename, format="zip"):
+     """Archive a directory and return the path of the resulting archive."""
+     return shutil.make_archive(output_filename, format, dir_name)
+
+ def make_dir_if_not_exist(folder):
+     if not os.path.exists(folder):
+         os.makedirs(folder)
+
+ def untar(fname, dirs):
+     """
+     Extract a tar.gz file.
+     :param fname: path of the compressed file
+     :param dirs: directory to extract into
+     :return: bool
+     """
+     try:
+         with tarfile.open(fname) as t:
+             t.extractall(path=dirs)
+         return True
+     except Exception as e:
+         print(e)
+         return False
+
+ def get_timestamp():
+     ts = pd.to_datetime(str(datetime.datetime.now()))
+     d = ts.strftime('%Y%m%d%H%M%S')
+     return d
+
+ def get_name_from_arvix(url):
+     # Scrape the paper title from the arXiv abstract page.
+     res = BeautifulSoup(requests.get(url).content, 'lxml').find("h1", attrs={"class": "title mathjax"})
+     if res is None:
+         return ''
+     # Drop the leading "Title:" prefix and make the title filesystem-friendly.
+     title = res.text[6:].replace(" ", "-")
+     return title
+
+ def download_source(pdf_lists=None, output_base=None, project_name=None, fetch_title=True, return_source=False):
+     base = output_base
+     project_name = project_name + get_timestamp()
+     base = os.path.join(base, project_name)
+     make_dir_if_not_exist(base)
+
+     for pdf_link in tqdm(pdf_lists):
+         file_stamp = pdf_link.split("/")[-1]
+         if fetch_title:
+             title = get_name_from_arvix(pdf_link)
+             if len(title) == 0:
+                 continue
+         else:
+             title = file_stamp
+         source_link = "https://arxiv.org/e-print/" + file_stamp
+         inp = os.path.join(base, 'input')
+         make_dir_if_not_exist(inp)
+         out = os.path.join(base, 'output')
+         make_dir_if_not_exist(out)
+         if return_source:
+             # Only print the source links instead of downloading them.
+             print(source_link)
+             continue
+         response = requests.get(source_link)
+         filename = file_stamp + ".tar.gz"
+         filepath = os.path.join(inp, filename)
+         with open(filepath, "wb") as f:
+             f.write(response.content)
+         outpath = os.path.join(out, title)
+         untar(filepath, outpath)
+     archive_dir(out, os.path.join(base, project_name))
+
+ if __name__ == '__main__':
+     s = get_timestamp()
+     print(s)
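For reference, `backend.py` can also be driven directly from a script or REPL, without the Streamlit frontend. A minimal usage sketch, where the links and the `./download` output directory and `"demo"` project name are illustrative:

```python
from backend import download_source

# Illustrative abs links; the trailing path segment is used as the arXiv ID.
papers = [
    "https://arxiv.org/abs/1512.03385",
    "https://arxiv.org/abs/1706.03762",
]

# Downloads each e-print into ./download/demo<timestamp>/input,
# extracts it under .../output/<title>, then zips the output folder.
download_source(pdf_lists=papers, output_base="./download", project_name="demo")
```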
framework.png ADDED
main.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import requests
+ import streamlit as st
+ from tqdm import tqdm
+ from backend import ToBase64, archive_dir, get_name_from_arvix, get_timestamp, make_dir_if_not_exist, untar
+
+ st.set_page_config(page_title="arXiv2Latex Downloader", page_icon=":page_with_curl:", layout="wide", initial_sidebar_state="expanded", menu_items={
+     "About": "Download the source LaTeX code of multiple arXiv papers with one click"
+ })
+
+ # title
+ st.title("arXiv2Latex Downloader")
+
+ # input arxiv links to download
+ pdf_links_input = st.text_area("Please input the paper links you want to download, following the format below (currently supports up to 10 links).", "")
+ st.markdown("""
+ Input example:
+ ```Plain Text
+ https://arxiv.org/abs/1512.03385
+ https://arxiv.org/abs/1706.03762
+ https://arxiv.org/abs/2009.09724
+ ```
+ """)
+ ## one-click download
+ crawling_or_not = st.button("Crawl the LaTeX Code")
+ if crawling_or_not:
+     print("Crawling...")
+     pdf_lists = pdf_links_input.split("\n")
+     print(pdf_lists)
+     # clean the pdf lists
+     pdf_lists = [i.strip() for i in pdf_lists if len(i.strip()) > 0]
+     # TODO: limit the number of papers to 10, since I am not sure whether base64 supports large file downloads
+     try:
+         if len(pdf_lists) > 10:
+             st.warning("Currently only supports up to 10 papers. Please input at most 10 links.")
+         else:
+             # parsing
+             base = './download/'
+             project_name = get_timestamp()
+             base = os.path.join(base, project_name)
+             make_dir_if_not_exist(base)
+
+             with st.spinner("Downloading papers..."):
+                 # progress bar
+                 bar = st.progress(0)
+                 download_status = st.empty()
+                 N = len(pdf_lists)
+                 for i, pdf_link in tqdm(enumerate(pdf_lists)):
+                     title = get_name_from_arvix(pdf_link)
+                     file_stamp = pdf_link.split("/")[-1]
+                     if len(title) == 0:
+                         # fall back to the arXiv ID if the title cannot be fetched
+                         title = file_stamp
+                     source_link = "https://arxiv.org/e-print/" + file_stamp
+                     inp = os.path.join(base, 'input')
+                     make_dir_if_not_exist(inp)
+                     out = os.path.join(base, 'output')
+                     make_dir_if_not_exist(out)
+                     response = requests.get(source_link)
+                     filename = file_stamp + ".tar.gz"
+                     filepath = os.path.join(inp, filename)
+                     with open(filepath, "wb") as f:
+                         f.write(response.content)
+                     outpath = os.path.join(out, title)
+                     untar(filepath, outpath)
+
+                     # finish one paper
+                     bar.progress((i + 1) / N)
+                     download_status.text(f"Iteration [{i+1}/{N}]: Finished downloading " + title)
+
+             with st.spinner("Archiving as Zip Files..."):
+                 # save it as a zip file
+                 filepath = archive_dir(out, os.path.join(base, project_name))
+
+             # download link
+             b64 = ToBase64(filepath).decode()
+             href = f"<a href='data:application/zip;base64,{b64}' download='arxiv2latex-output-{get_timestamp()}.zip'>Click here to Download the Output Latex Zip Files</a>"
+             st.markdown(href, unsafe_allow_html=True)
+
+             # status
+             st.success("Finished")
+     except Exception as e:
+         st.error("Something went wrong. Please check the input or contact me to fix this bug. Error message:\n" + str(e))
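A note on the download link: the base64 data-URI anchor in `main.py` is a workaround from before Streamlit shipped a native download widget. On a recent Streamlit version, `st.download_button` does the same job without the concern about large base64 payloads; a minimal sketch under that assumption, where `filepath` stands for the zip produced by `archive_dir` above:

```python
import streamlit as st

# Assumes `filepath` points at the zip archive produced by archive_dir.
with open(filepath, "rb") as f:
    st.download_button(
        label="Download the Output LaTeX Zip File",
        data=f,
        file_name="arxiv2latex-output.zip",
        mime="application/zip",
    )
```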
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit
+ requests
+ pandas
+ tqdm
+ bs4
+ lxml
setup.sh ADDED
@@ -0,0 +1,13 @@
+ mkdir -p ~/.streamlit/
+
+ echo "\
+ [general]\n\
+ email = \"[email protected]\"\n\
+ " > ~/.streamlit/credentials.toml
+
+ echo "\
+ [server]\n\
+ headless = true\n\
+ enableCORS=false\n\
+ port = $PORT\n\
+ " > ~/.streamlit/config.toml