"""Gradio app listing datasets recently added to the Hugging Face Hub."""

import os
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional

import gradio as gr
import pandas as pd
from cachetools import TTLCache, cached
from dotenv import load_dotenv
from httpx import Client
from huggingface_hub import DatasetCard, hf_hub_url, list_datasets
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map

load_dotenv()

LIMIT = 5_000
CACHE_TIME = 60 * 60 * 12  # 12 hours
REMOVE_ORGS = {
    "HuggingFaceM4",
    "HuggingFaceBR4",
    "open-llm-leaderboard",
    "TrainingDataPro",
}

HF_TOKEN = os.getenv("HF_TOKEN")
USER_AGENT = os.getenv("USER_AGENT")

# Only send headers that are actually configured: a ``None`` header value is
# rejected by httpx, and "Bearer None" is not a valid credential.
headers = {}
if HF_TOKEN:
    headers["authorization"] = f"Bearer {HF_TOKEN}"
if USER_AGENT:
    headers["user-agent"] = USER_AGENT

client = Client(
    headers=headers,
    timeout=120,
)

# LOCAL = False
# if platform == "darwin":
#     LOCAL = True
# cache_dir = "cache" if LOCAL else "/data/diskcache"
# cache = Cache(cache_dir)
cache = TTLCache(maxsize=10, ttl=CACHE_TIME)
# ``get_datasets`` and ``load_data`` both take no arguments, so under the
# default cachetools key they would share a single entry in ``cache`` and
# overwrite each other. The raw listing therefore gets its own cache.
_datasets_cache = TTLCache(maxsize=1, ttl=CACHE_TIME)

_SORT_COLUMNS = ["likes", "downloads", "len"]


def get_three_months_ago() -> datetime:
    """Return the timezone-aware UTC datetime 90 days before now."""
    now = datetime.now(timezone.utc)
    return now - timedelta(days=90)


def add_created_data(dataset) -> Dict[str, Any]:
    """Return the dataset's attributes as a dict with a ``createdAt`` key.

    A copy of the attribute dict is returned so that the listing objects
    held by the ``get_datasets`` cache are not mutated by later steps.
    """
    return {**vars(dataset), "createdAt": dataset.created_at}


def get_readme_len(dataset: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Fetch the dataset's README.md and record its body length under ``len``.

    Returns the (mutated) ``dataset`` dict when the README was fetched with
    status 200, otherwise ``None``. Any failure is printed and treated as
    "no README" so that one bad dataset does not abort the whole batch.
    """
    try:
        url = hf_hub_url(dataset["id"], "README.md", repo_type="dataset")
        resp = client.get(url)
        if resp.status_code != 200:
            return None
        card = DatasetCard(resp.text)
        dataset["len"] = len(card.text)
        return dataset
    except Exception as e:  # best-effort: skip datasets whose card fails
        print(f"Could not read README for {dataset.get('id')}: {e}")
        return None


def check_ds_server_valid(id) -> bool:
    """Return True when datasets-server reports a preview for dataset ``id``.

    Network errors, non-200 responses and unparsable bodies all yield False.
    """
    url = "https://datasets-server.huggingface.co/is-valid"
    try:
        response = client.get(url, params={"dataset": id})
        if response.status_code != 200:
            return False
        data = response.json()
    except Exception as e:  # best-effort: treat any failure as "no preview"
        print(f"Could not check datasets-server status for {id}: {e}")
        return False
    # ``preview`` is a boolean flag in the response, so test its truthiness;
    # a mere presence check would accept ``"preview": false`` as valid.
    return bool(data.get("preview"))


def has_server_preview(dataset: Dict[str, Any]) -> Dict[str, Any]:
    """Set ``dataset["server_preview"]`` from datasets-server and return it."""
    dataset["server_preview"] = check_ds_server_valid(dataset["id"])
    return dataset


def render_model_hub_link(hub_id: str) -> str:
    """Render ``hub_id`` as a Markdown link to its dataset page on the Hub."""
    link = f"https://huggingface.co/datasets/{hub_id}"
    return f"[{hub_id}]({link})"


@cached(_datasets_cache)
def get_datasets():
    """Return up to ``LIMIT`` dataset listings, sorted by ``createdAt``.

    The result is cached for ``CACHE_TIME`` seconds.
    """
    return list(
        tqdm(list_datasets(limit=LIMIT, full=True, sort="createdAt", direction=-1))
    )


@cached(cache)
def load_data() -> List[Dict[str, Any]]:
    """Build the list of dataset dicts shown by the app.

    Keeps datasets created within the last 90 days whose README could be
    fetched, and annotates each with ``len`` and ``server_preview``. The
    result is cached for ``CACHE_TIME`` seconds.
    """
    datasets = [add_created_data(dataset) for dataset in tqdm(get_datasets())]
    cutoff = get_three_months_ago()  # compute once, not once per dataset
    filtered = [
        ds
        for ds in datasets
        if ds["createdAt"] is not None and ds["createdAt"] > cutoff
    ]
    ds_with_len = thread_map(get_readme_len, filtered)
    ds_with_len = [ds for ds in ds_with_len if ds is not None]
    ds_with_valid_status = thread_map(has_server_preview, ds_with_len)
    ds_with_valid_status = [ds for ds in ds_with_valid_status if ds is not None]
    return ds_with_valid_status


columns_to_drop = [
    "cardData",
    "gated",
    "sha",
    "tags",
    "description",
    "siblings",
    "disabled",
    "_id",
    "private",
    "author",
    # "citation",
    "lastModified",
]


def prep_dataframe(remove_orgs_and_users=REMOVE_ORGS, columns_to_drop=columns_to_drop):
    """Return the loaded datasets as a display-ready, sorted DataFrame.

    Args:
        remove_orgs_and_users: authors whose datasets are excluded; a falsy
            value disables the exclusion.
        columns_to_drop: columns removed before display; names that are not
            present in the frame are ignored.

    Returns:
        A DataFrame sorted by likes, downloads and README length
        (descending), or an empty DataFrame when nothing was loaded.
    """
    ds_with_len = load_data()
    if remove_orgs_and_users:
        ds_with_len = [
            ds for ds in ds_with_len if ds.get("author") not in remove_orgs_and_users
        ]
    df = pd.DataFrame(ds_with_len)
    if df.empty:
        # No rows means no columns either; the steps below would raise.
        return df
    df["id"] = df["id"].apply(render_model_hub_link)
    if columns_to_drop:
        # The attribute names come from the Hub client and are not fixed
        # here, so tolerate columns that are absent.
        df = df.drop(columns=columns_to_drop, errors="ignore")
    df = df.sort_values(by=_SORT_COLUMNS, ascending=False)
    return df


def filter_df_by_max_age(df, max_age_days=None):
    """Drop rows without ``createdAt`` and, if set, rows older than the limit."""
    df = df.dropna(subset=["createdAt"])
    if max_age_days is not None:
        max_date = datetime.now(timezone.utc) - timedelta(days=max_age_days)
        df = df[df["createdAt"] >= max_date]
    return df


def filter_by_readme_len(df, min_len=None):
    """Keep rows whose ``len`` is at least ``min_len`` (no-op when None)."""
    if min_len is not None:
        df = df[df["len"] >= min_len]
    return df


def filter_df(max_age_days=None, min_len=None, needs_server_preview: bool = False):
    """Return the dataset table filtered by the UI controls.

    Args:
        max_age_days: keep datasets created at most this many days ago.
        min_len: keep datasets whose README body has at least this length.
        needs_server_preview: when True, keep only datasets for which
            ``server_preview`` is set.

    Returns:
        The filtered DataFrame sorted by likes, downloads and README length
        (descending).
    """
    df = prep_dataframe()
    if df.empty:
        return df
    if needs_server_preview:
        df = df[df["server_preview"].astype(bool)]
    if max_age_days is not None:
        df = filter_df_by_max_age(df, max_age_days=max_age_days)
    if min_len is not None:
        df = filter_by_readme_len(df, min_len=min_len)
    df = df.sort_values(by=_SORT_COLUMNS, ascending=False)
    return df


with gr.Blocks() as demo:
    gr.Markdown("# Recent Datasets on the Hub")
    gr.Markdown(
        "Datasets added in the past 90 days with a README.md and some metadata."
    )
    with gr.Row():
        max_age_days = gr.Slider(
            label="Max Age (days)",
            value=7,
            minimum=0,
            maximum=90,
            step=1,
            interactive=True,
        )
        min_len = gr.Slider(
            label="Minimum README Length",
            value=300,
            minimum=0,
            maximum=1000,
            step=50,
            interactive=True,
        )
        needs_server_preview = gr.Checkbox(
            label="Exclude datasets without datasets-server preview?",
            value=False,
            interactive=True,
        )

    # NOTE(review): the initial table is produced by calling ``filter_df``
    # with its defaults (no filtering), not with the control values shown
    # above — confirm this is intended.
    output = gr.DataFrame(
        filter_df, datatype="markdown", min_width=160 * 2.5, height=1000
    )
    max_age_days.input(
        filter_df,
        inputs=[max_age_days, min_len, needs_server_preview],
        outputs=[output],
    )
    min_len.input(
        filter_df,
        inputs=[max_age_days, min_len, needs_server_preview],
        outputs=[output],
    )
    needs_server_preview.change(
        filter_df,
        inputs=[max_age_days, min_len, needs_server_preview],
        outputs=[output],
    )

demo.launch()