import os

import requests
import streamlit as st
from tqdm import tqdm

from backend import *

# Cap the number of papers per request: the result is delivered as a base64
# data URI, which may not work reliably for very large archives.
predefined_limits = 10

st.set_page_config(
    page_title="arXiv2Latex Downloader",
    page_icon=":page_with_curl:",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        "About": "Download the source LaTeX code of multiple arXiv papers with one click"
    },
)

# Title
st.title("arXiv2Latex Downloader")

# Input arXiv links to download
pdf_links_input = st.text_area(
    f"Please input the paper links you want to download, one per line "
    f"(currently supports up to {predefined_limits} links).",
    "",
)
st.markdown("""
Input example:
```Plain Text
https://arxiv.org/abs/1512.03385
https://arxiv.org/abs/1706.03762
https://arxiv.org/abs/2009.09724
```
""")

## One-click download
crawling_or_not = st.button("Crawl the LaTeX Code")
if crawling_or_not:
    print("Crawling...")
    pdf_lists = pdf_links_input.split("\n")
    print(pdf_lists)
    # Clean the link list: strip whitespace and drop empty lines
    pdf_lists = [i.strip() for i in pdf_lists if len(i.strip()) > 0]
    if len(pdf_lists) > predefined_limits:
        st.warning(
            f"Currently only up to {predefined_limits} papers are supported. "
            f"Please input {predefined_limits} links or fewer."
        )
    else:
        # Use a timestamped project directory so runs do not overwrite each other
        base = "./download/"
        project_name = get_timestamp().replace(" ", "-")
        base = os.path.join(base, project_name)
        make_dir_if_not_exist(base)

        with st.spinner("Downloading papers..."):
            # Progress bar plus a status line updated after each paper
            bar = st.progress(0)
            download_status = st.empty()
            N = len(pdf_lists)
            for i, pdf_link in tqdm(enumerate(pdf_lists), total=N):
                title = get_name_from_arvix(pdf_link)
                # arXiv serves the LaTeX source tarball at /e-print/<paper-id>
                file_stamp = pdf_link.split("/")[-1]
                source_link = "https://arxiv.org/e-print/" + file_stamp
                inp = os.path.join(base, "input")
                make_dir_if_not_exist(inp)
                out = os.path.join(base, "output")
                make_dir_if_not_exist(out)
                response = requests.get(source_link)
                filename = file_stamp + ".tar.gz"
                filepath = os.path.join(inp, filename)
                with open(filepath, "wb") as f:
                    f.write(response.content)
                outpath = os.path.join(out, title)
                untar(filepath, outpath)
                # Finished one paper
                bar.progress((i + 1) / N)
                download_status.text(f"Iteration [{i+1}/{N}]: finished downloading " + title)

        with st.spinner("Archiving as Zip Files..."):
            # Bundle all extracted sources into a single zip file
            filepath = archive_dir(out, os.path.join(base, project_name))

        # Embed the zip in a base64 data URI so it can be downloaded from a link
        b64 = ToBase64(filepath).decode()
        href = (
            f'<a href="data:file/zip;base64,{b64}" download="{project_name}.zip">'
            "Click here to Download the Output Latex Zip Files</a>"
        )
        st.markdown(href, unsafe_allow_html=True)

        # Status
        st.success("Finished")
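
# ---------------------------------------------------------------------------
# Hedged sketch of the helpers this script expects from backend.py.
# `from backend import *` above pulls in get_timestamp, make_dir_if_not_exist,
# get_name_from_arvix, untar, archive_dir, and ToBase64, whose bodies are not
# shown in this file. The implementations below are assumptions reconstructed
# from how the call sites use them, not the actual backend module; they are
# included only so the script can be read on its own. In the real project
# they would live in backend.py, not here.
# ---------------------------------------------------------------------------
import base64
import re
import shutil
import tarfile
from datetime import datetime


def get_timestamp():
    # Assumed: a human-readable timestamp used as the per-run folder name
    # (the caller replaces the remaining space with a dash).
    return datetime.now().strftime("%Y-%m-%d %H-%M-%S")


def make_dir_if_not_exist(path):
    # Assumed: create the directory (and any missing parents) if absent.
    os.makedirs(path, exist_ok=True)


def get_name_from_arvix(abs_link):
    # Assumed: fetch the abstract page and extract the paper title from the
    # <title> tag, dropping the leading "[<id>]" prefix arXiv puts there.
    html = requests.get(abs_link).text
    match = re.search(r"<title>(?:\[[^\]]*\]\s*)?(.*?)</title>", html)
    return match.group(1).strip() if match else abs_link.split("/")[-1]


def untar(tar_path, out_dir):
    # Assumed: extract an arXiv e-print tarball (gzip-compressed tar) into
    # out_dir; tarfile auto-detects the compression in "r" mode.
    with tarfile.open(tar_path) as tar:
        tar.extractall(out_dir)


def archive_dir(src_dir, dst_base):
    # Assumed: zip up src_dir and return the path of the created archive
    # (shutil.make_archive appends the ".zip" suffix itself).
    return shutil.make_archive(dst_base, "zip", src_dir)


def ToBase64(path):
    # Assumed: read a file and return its base64-encoded bytes; the caller
    # .decode()s them into the string embedded in the download data URI.
    with open(path, "rb") as f:
        return base64.b64encode(f.read())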