# hf-public-data-insights/python/0_download_files.py
# Xianbao QIAN
# query tables and create example
# 9279ca3
# raw
# history blame
# 3.99 kB
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import duckdb
import random
import argparse
import yaml
# Downloads and the generated example files are all written under ./tables,
# so make sure that folder is present before anything else runs.
os.makedirs("tables", exist_ok=True)

# Parquet files to fetch: one each for models, datasets and spaces.
urls = [
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet?download=true",
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/datasets.parquet?download=true",
    "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/spaces.parquet?download=true",
]
def download_file(url, overwrite=True):
    """Download ``url`` into the ``tables`` folder, showing a progress bar.

    The local file name is the last path segment of the URL with any query
    string removed, e.g. ``.../models.parquet?download=true`` is stored as
    ``tables/models.parquet``.

    Args:
        url: Address of the file to fetch.
        overwrite: When false and the target file already exists, the
            download is skipped and the existing file is left untouched.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    filename = os.path.join("tables", url.split("/")[-1].split("?")[0])
    if not overwrite and os.path.exists(filename):
        print(f"File already exists: {filename}. Skipping download.")
        return

    block_size = 64 * 1024  # 64 KiB chunks: far fewer write/progress calls than 1 KB
    # Stream into a side file first so that a failed or interrupted transfer
    # never leaves a truncated file at the final path (which a later
    # --no-overwrite run would otherwise accept as complete).
    partial_name = filename + ".part"

    # (connect, read) timeouts; the read timeout applies per socket read, not
    # to the whole transfer, so large files are still fine.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        # Fail loudly instead of saving an HTML/JSON error body as the data file.
        response.raise_for_status()
        total_size = int(response.headers.get("Content-Length", 0))
        try:
            with open(partial_name, "wb") as file, tqdm(
                desc=filename,
                total=total_size,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]",
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    size = file.write(data)
                    progress_bar.update(size)
        except BaseException:
            # Clean up the partial file on any failure (including Ctrl-C),
            # then let the original error propagate.
            if os.path.exists(partial_name):
                os.remove(partial_name)
            raise

    # Atomic rename: the final path only ever holds a fully written file.
    os.replace(partial_name, filename)
    print(f"Downloaded: {filename}")
def _table_path(url):
    """Return the local path under ``tables`` that ``url`` is downloaded to."""
    return os.path.join("tables", url.split("/")[-1].split("?")[0])


def _format_example_yaml(table_name, table_structure, random_items):
    """Render a table's column list and sample rows as the example YAML text."""
    columns = [row[0] for row in table_structure]

    lines = [f"{table_name}:", "  table_structure:"]
    for row in table_structure:
        column, dtype = row[:2]  # Only the first two fields of each row are used
        # "type" is indented to line up under "column" so that both keys
        # belong to the same list item.
        lines.append(f"    - column: {column}")
        lines.append(f"      type: {dtype}")

    lines.append("  random_items:")
    for item in random_items:
        fields = [f"{column}: {value}" for column, value in zip(columns, item)]
        # One list item per row; continuation fields are aligned with the
        # first one. Trailing whitespace of the item is trimmed.
        # NOTE(review): values are written verbatim (no quoting/escaping), so
        # values containing newlines or YAML syntax will not round-trip
        # through a strict YAML parser.
        lines.append(("    - " + "\n      ".join(fields)).rstrip())

    return "\n".join(lines) + "\n"


def main(overwrite):
    """Download every file in ``urls`` and write an example YAML per table.

    For each downloaded Parquet file, the column names/types and ten randomly
    chosen rows are queried through DuckDB and written to
    ``tables/<table_name>.example.yaml``.

    Args:
        overwrite: Passed to ``download_file``; when false, files that already
            exist locally are not downloaded again.

    Returns:
        None.
    """
    # One worker per file so all downloads run concurrently.
    with ThreadPoolExecutor(max_workers=max(1, len(urls))) as executor:
        futures = [executor.submit(download_file, url, overwrite) for url in urls]
        # result() re-raises any exception that occurred in a worker.
        for future in futures:
            future.result()
    print("All files downloaded successfully.")

    # Process each downloaded Parquet file
    for url in urls:
        filename = _table_path(url)
        table_name = os.path.splitext(os.path.basename(filename))[0]

        # Query the Parquet file through an in-memory DuckDB database and
        # always release the connection, even if a query fails.
        con = duckdb.connect(database=':memory:')
        try:
            con.execute(f"CREATE VIEW {table_name} AS SELECT * FROM parquet_scan('{filename}')")
            table_structure = con.execute(f"DESCRIBE {table_name}").fetchall()
            random_items = con.execute(
                f"SELECT * FROM {table_name} ORDER BY RANDOM() LIMIT 10"
            ).fetchall()
        finally:
            con.close()

        # Save the YAML content to a file in the "tables" folder. UTF-8 is
        # requested explicitly so non-ASCII values do not depend on the
        # platform's default encoding.
        yaml_file = os.path.join("tables", f"{table_name}.example.yaml")
        with open(yaml_file, "w", encoding="utf-8") as file:
            file.write(_format_example_yaml(table_name, table_structure, random_items))
        print(f"Generated: {yaml_file}")
    print("Example files generated successfully.")
if __name__ == "__main__":
    # Command-line entry point: downloads overwrite existing files unless
    # --no-overwrite is given.
    cli = argparse.ArgumentParser(description="Download and process Parquet files.")
    cli.add_argument(
        "--no-overwrite",
        action="store_true",
        help="Skip downloading files that already exist.",
    )
    keep_existing = cli.parse_args().no_overwrite
    main(overwrite=not keep_existing)