# Scraped page-header residue, not part of the module: "Spaces: Sleeping"
import requests
import duckdb
# Base URL of the dataset viewer API. It carries no trailing slash because the
# endpoint paths appended to it ("/size", "/parquet", "/info") start with one.
DATASET_VIEWER_API_URL = "https://datasets-server.huggingface.co"
session = requests.Session()
def fetch_json(url, params=None, timeout=20):
    """Send a GET request through the shared session and return its JSON body.

    Args:
        url: Address to request.
        params: Optional query-string parameters forwarded to the request.
        timeout: Timeout handed to the request (defaults to 20).

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.HTTPError: If ``raise_for_status`` rejects the response.
        Exception: If the decoded payload contains an ``"error"`` entry.
    """
    resp = session.get(url, params=params, timeout=timeout)
    resp.raise_for_status()

    payload = resp.json()
    if "error" not in payload:
        return payload
    # A successful HTTP status can still carry an error report in the body.
    raise Exception(f"Error fetching data: {payload['error']}")
def get_split_rows(dataset, config, split):
    """Return the row count of one split, as reported by the ``/size`` endpoint.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.
        config: Configuration name sent as the ``config`` query parameter.
        split: Name of the split to look for in the response.

    Returns:
        The ``num_rows`` value of the first entry whose ``split`` equals *split*.

    Raises:
        Exception: If no entry in the response matches *split*.
    """
    size_payload = fetch_json(
        f"{DATASET_VIEWER_API_URL}/size",
        {"dataset": dataset, "config": config},
    )
    for entry in size_payload["size"]["splits"]:
        if entry["split"] == split:
            return entry["num_rows"]
    raise Exception(f"Error fetching split {split} in config {config}")
def get_parquet_urls(dataset, config, split):
    """Return the Parquet file URLs of a split as one SQL-ready string.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.
        config: Configuration name sent as the ``config`` query parameter.
        split: Split name sent as the ``split`` query parameter.

    Returns:
        Every ``url`` listed under ``parquet_files`` in the ``/parquet``
        response, each wrapped in single quotes and joined with commas
        (no surrounding brackets).
    """
    listing = fetch_json(
        f"{DATASET_VIEWER_API_URL}/parquet",
        {"dataset": dataset, "config": config, "split": split},
    )
    quoted_urls = []
    for entry in listing["parquet_files"]:
        quoted_urls.append(f"'{entry['url']}'")
    return ",".join(quoted_urls)
def get_docs_from_parquet(parquet_urls, column, offset, limit):
    """Read one column's values for a window of rows from Parquet files.

    Args:
        parquet_urls: Comma-separated, single-quoted file URLs, ready to be
            placed inside the brackets of ``read_parquet([...])``.
        column: Name of the column to read.
        offset: Number of rows to skip; coerced with ``int``.
        limit: Maximum number of rows to return; coerced with ``int``.

    Returns:
        A list holding the selected column's values for the requested rows.

    Raises:
        TypeError, ValueError: If *offset* or *limit* cannot be converted
            to an integer.
    """
    # Quote the column as an SQL identifier (doubling embedded double quotes)
    # so names containing spaces, punctuation or reserved words are accepted
    # and the name cannot change the shape of the query.
    quoted_column = '"' + str(column).replace('"', '""') + '"'

    # Only plain integers may reach the LIMIT/OFFSET clauses.
    row_limit = int(limit)
    row_offset = int(offset)

    sql_query = (
        f"SELECT {quoted_column} FROM read_parquet([{parquet_urls}]) "
        f"LIMIT {row_limit} OFFSET {row_offset};"
    )
    df = duckdb.sql(sql_query).to_df()
    return df[column].tolist()
def get_info(dataset):
    """Return the ``dataset_info`` section of the ``/info`` endpoint response.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.

    Returns:
        The value stored under ``dataset_info`` in the response payload.
    """
    info_payload = fetch_json(
        f"{DATASET_VIEWER_API_URL}/info",
        {"dataset": dataset},
    )
    return info_payload["dataset_info"]