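"""Scrape ZIP links from the paginated IRS Internal Revenue Manual (IRM)
downloads index, download each archive, and extract the PDFs it contains.

An example invocation is given at the bottom of this file.
"""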
import re
import sys
import urllib.request  # "import urllib" alone does not expose urlretrieve
import zipfile
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_zip_urls(base="https://www.irs.gov/downloads/irm", start_page=1, max_page=74):
    """Collect .zip links from each page of the paginated downloads index."""
    urls = []
    for page_num in range(start_page, max_page + 1):
        url = f"{base}?page={page_num}"
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        for link in soup.find_all("a", href=re.compile(r"\.zip$")):
            # resolve relative hrefs against the page URL
            urls.append(urljoin(url, link["href"]))
    return urls


def download_and_unzip(urls, unzip_dir):
    """Download each archive and extract only its PDF members into unzip_dir."""
    for zip_url in urls[:10]:  # only the first 10 archives are processed
        filename = zip_url.split("/")[-1]
        urllib.request.urlretrieve(zip_url, filename)
        with zipfile.ZipFile(filename, "r") as zip_ref:
            for file_info in zip_ref.infolist():
                # check if the file has a PDF extension
                if file_info.filename.lower().endswith(".pdf"):
                    # extract the file to the PDF directory
                    zip_ref.extract(file_info, unzip_dir)
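
# Note: extraction is selective. zip_ref.extract() is called per PDF member
# instead of zip_ref.extractall(), so non-PDF files inside the archives
# never touch disk.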


if __name__ == "__main__":
    # expected args: base URL, first page, last page, output directory for PDFs
    base_url = sys.argv[1]
    page_start = int(sys.argv[2])
    page_max = int(sys.argv[3])
    pdf_dir = sys.argv[4]

    print(f"Grabbing zip URLs from {base_url}")
    zip_urls = get_zip_urls(base_url, page_start, page_max)
    print(
        f"Found {len(zip_urls)} zip URLs, downloading and unzipping PDFs into {pdf_dir}"
    )
    download_and_unzip(zip_urls, pdf_dir)
    print("Finished unzipping")
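
# Example invocation (the script name and output directory are illustrative;
# 74 matches the max_page default above):
#
#     python download_irm.py https://www.irs.gov/downloads/irm 1 74 ./irm_pdfs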