lhoestq's picture
lhoestq HF staff
minor
0d3455a
import os
from uuid import uuid4
import duckdb
import gradio as gr
import pandas as pd
import requests
from duckdb import DuckDBPyConnection, DuckDBPyRelation
from duckdb.typing import DuckDBPyType
from huggingface_hub import HfApi
Connection = DuckDBPyConnection
Table = DuckDBPyRelation
Dtype = DuckDBPyType
READ_PARQUET_FUNCTIONS = ("dd.read_parquet", "pd.read_parquet")
memory_con = duckdb.connect(":memory:")
empty_tbl = memory_con.sql("SELECT null as col_1, null as col_2, null as col_3, null as col_4 FROM range(10)")
PAGE_SIZE = 5
NUM_TRENDING_DATASETS = 10
NUM_USER_DATASETS = 10
SESSIONS_DIR = "s"
SESSION_DIR_SPACES_SYMLINK = os.path.join("/data", SESSIONS_DIR)
SPACE_ID = os.getenv("SPACE_ID")
SPACE_HOST = os.getenv("SPACE_HOST")
APP_URL = f"https://huggingface.co/spaces/{SPACE_ID}" if SPACE_ID else "http://127.0.0.1:7870"
HOST_URL = f"https://{SPACE_HOST}" if SPACE_HOST else "http://127.0.0.1:7870"
css = """
.transparent-dropdown, .transparent-dropdown .container .wrap, .transparent-accordion {
background: var(--body-background-fill);
}
.gradio-container {
padding: var(--size-4) 0 !important;
max-width: 98% !important;
}
.cell-menu-button {
z-index: -1;
}
.centered {
text-align: center;
}
"""
def to_json_df(con: Connection, tbl: Table) -> pd.DataFrame:
query = ", ".join("nullif(([" + col + "]::JSON)[0]::VARCHAR, 'null') AS " + col for col in tbl.columns)
out = con.sql(f"SELECT {query} FROM tbl").df()
return out
def from_json_df(con: Connection, df: pd.DataFrame, columns: list[str], dtypes: list[Dtype]) -> Table:
query = ", ".join(
"if(" + col + " IS null, null, "
+ (
"trim(" + col + """::JSON::VARCHAR, '"')"""
if str(dtype) == "VARCHAR"
else "(" + col + "::JSON::" + str(dtype) + ")"
) # remove double quotes at the start and end
+ ") AS " + col for col, dtype in zip(columns, dtypes))
return con.sql(f"SELECT {query} FROM df")
def setup_edits(con: Connection, dataset: str, pattern: str) -> None:
con.sql(f"CREATE VIEW IF NOT EXISTS dataset AS SELECT * FROM 'hf://datasets/{dataset}/{pattern}'")
empty_dataset_tbl = con.sql("SELECT * FROM dataset LIMIT 0;")
columns = empty_dataset_tbl.columns
dtypes = empty_dataset_tbl.dtypes
con.sql(f"CREATE TABLE IF NOT EXISTS edits(rowid INTEGER PRIMARY KEY, {', '.join(col + ' ' + str(dtype) for col, dtype in zip(columns, dtypes))})")
con.sql(
"CREATE VIEW IF NOT EXISTS edited_dataset AS "
"WITH edits_per_rowid AS (SELECT * FROM (SELECT unnest(range(max(rowid) + 1)) AS rowid FROM edits) LEFT JOIN edits USING (rowid) ORDER BY rowid) "
f"SELECT {', '.join('ifnull(edits_per_rowid.' + col + ', dataset.' + col + ') AS ' + col for col in columns)} FROM dataset POSITIONAL JOIN edits_per_rowid"
)
with gr.Blocks(css=css) as demo:
session_state = gr.BrowserState()
loading_codes_json = gr.JSON([], visible=False)
with gr.Row():
with gr.Column():
gr.Markdown("# πŸ€— Hugging Face Dataset Spreadsheets πŸ“\n\nEdit any Parquet dataset on Hugging Face (full list <a href='https://huggingface.co/datasets' target='_blank'>here</a>)", elem_classes="centered")
with gr.Group():
with gr.Tab("Select Dataset"):
with gr.Row():
dataset_dropdown = gr.Dropdown(label="Dataset", allow_custom_value=True, scale=10)
subset_dropdown = gr.Dropdown(info="Subset", allow_custom_value=True, show_label=False, visible=False)
split_dropdown = gr.Dropdown(info="Split", allow_custom_value=True, show_label=False, visible=False)
with gr.Tab("Share Link"):
share_link_textbox = gr.Textbox(label="Copy the link to the Spreadsheet:", show_copy_button=True, interactive=False)
with gr.Tab("Use Locally"):
use_locally_markdown = gr.Markdown()
dataframe = gr.DataFrame(to_json_df(memory_con, empty_tbl), interactive=True, wrap=True)
with gr.Row():
prev_button = gr.Button("< Previous", min_width=140, interactive=False)
with gr.Column(scale=9, min_width=0):
page_html = gr.HTML("Page 1", elem_classes="centered")
next_button = gr.Button("Next >", min_width=140)
def show_subset_dropdown(dataset: str):
if dataset and "/" not in dataset.strip().strip("/"):
return []
resp = requests.get(f"/static-proxy?url=https%3A%2F%2Fdatasets-server.huggingface.co%2Fcompatible-libraries%3Fdataset%3D%3Cspan class="hljs-subst">{dataset}", timeout=3).json()
loading_codes = ([lib["loading_codes"] for lib in resp.get("libraries", []) if lib["function"] in READ_PARQUET_FUNCTIONS] or [[]])[0] or []
subsets = [loading_code["config_name"] for loading_code in loading_codes]
subset = (subsets or [""])[0]
return dict(choices=subsets, value=subset, visible=len(subsets) > 1, key=hash(str(loading_codes))), loading_codes
def show_split_dropdown(subset: str, loading_codes: list[dict]):
splits = ([list(loading_code["arguments"]["splits"]) for loading_code in loading_codes if loading_code["config_name"] == subset] or [[]])[0]
split = (splits or [""])[0]
return dict(choices=splits, value=split, visible=len(splits) > 1, key=hash(str(loading_codes) + subset))
def show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page: int):
pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
if session and dataset and subset and split and pattern:
duckdb_file = session + ".duckdb"
con = duckdb.connect(os.path.join(SESSIONS_DIR, duckdb_file))
setup_edits(con, dataset, pattern)
# Uncomment to have one edit for testing
# con.sql("INSERT OR REPLACE INTO edits SELECT 2 AS rowid, * FROM dataset LIMIT 1")
tbl = con.sql(f"SELECT * FROM edited_dataset LIMIT {PAGE_SIZE} OFFSET {(page - 1) * PAGE_SIZE}")
return dict(value=to_json_df(con, tbl))
else:
return dict(value=to_json_df(memory_con, empty_tbl))
@demo.load(inputs=session_state, outputs=[dataset_dropdown, loading_codes_json, subset_dropdown, split_dropdown, dataframe, session_state, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
def _fetch_datasets(session: str | None, request: gr.Request):
datasets = ["CohereForAI/Global-MMLU"]
datasets += [ds.id for ds in HfApi().list_datasets(limit=NUM_TRENDING_DATASETS, sort="trendingScore", direction=-1, filter=["format:parquet"]) if ds.id not in datasets]
session = request.query_params.get(SESSIONS_DIR) or session
if session:
namespace, dataset_name, subset, split, _ = session.split("--")
dataset = namespace + "/" + dataset_name
if "dataset" in request.query_params and request.query_params["dataset"] != dataset:
session = None
dataset = request.query_params["dataset"]
else:
dataset = request.query_params.get("dataset") or datasets[0]
subsets, loading_codes = show_subset_dropdown(dataset)
subsets["value"] = subset if session else subsets["value"]
splits = show_split_dropdown(subsets["value"], loading_codes)
splits["value"] = split if session else splits["value"]
session = session if isinstance(session, str) else f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
page = 1
input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
return {
dataset_dropdown: gr.Dropdown(choices=datasets, value=dataset),
loading_codes_json: loading_codes,
subset_dropdown: gr.Dropdown(**subsets),
split_dropdown: gr.Dropdown(**splits),
session_state: session,
dataframe: gr.DataFrame(**input_dataframe),
share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
use_locally_markdown: (
f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb AS db';\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
),
prev_button: gr.Button(interactive=False),
next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
page_html: f"Page {page}",
}
@dataset_dropdown.select(inputs=[session_state, dataset_dropdown], outputs=[session_state, loading_codes_json, subset_dropdown, split_dropdown, dataframe, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
def _show_subset_dropdown(session: str | None, dataset: str):
subsets, loading_codes = show_subset_dropdown(dataset)
splits = show_split_dropdown(subsets["value"], loading_codes)
session = f"{dataset.replace('/', '--')}--{subsets['value']}--{splits['value']}--{uuid4()}"
page = 1
input_dataframe = show_input_dataframe(dataset, subsets["value"], splits["value"], loading_codes, session, page)
return {
loading_codes_json: loading_codes,
subset_dropdown: gr.Dropdown(**subsets),
split_dropdown: gr.Dropdown(**splits),
session_state: session,
dataframe: gr.DataFrame(**input_dataframe),
share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
use_locally_markdown: (
f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb AS db';\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
),
prev_button: gr.Button(interactive=False),
next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
page_html: f"Page {page}",
}
@subset_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, loading_codes_json], outputs=[session_state, split_dropdown, dataframe, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
def _show_split_dropdown(dataset: str, subset: str, loading_codes: list[dict]):
splits = show_split_dropdown(subset, loading_codes)
session = f"{dataset.replace('/', '--')}--{subset}--{splits['value']}--{uuid4()}"
page = 1
input_dataframe = show_input_dataframe(dataset, subset, splits["value"], loading_codes, session, page)
return {
split_dropdown: gr.Dropdown(**splits),
session_state: session,
dataframe: gr.DataFrame(**input_dataframe),
share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
use_locally_markdown: (
f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb AS db';\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
),
prev_button: gr.Button(interactive=False),
next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
page_html: f"Page {page}",
}
@split_dropdown.select(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json], outputs=[session_state, dataframe, share_link_textbox, use_locally_markdown, prev_button, next_button, page_html])
def _show_input_dataframe(dataset: str, subset: str, split: str, loading_codes: list[dict]) -> pd.DataFrame:
session = f"{dataset.replace('/', '--')}--{subset}--{split}--{uuid4()}"
page = 1
input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
return {
session_state: session,
dataframe: gr.DataFrame(**input_dataframe),
share_link_textbox: f"{APP_URL}?{SESSIONS_DIR}={session}",
use_locally_markdown: (
f"""In DuckDB:\n\n```sql\nATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb AS db';\nUSE db;\nSELECT * FROM edited_dataset LIMIT 5;\n```\n\n"""
f"""In Python:\n\n```python\nimport duckdb\n\nduckdb.sql("ATTACH '{HOST_URL}/gradio_api/file={SESSIONS_DIR}/{session}.duckdb' AS db")\nduckdb.sql("USE db")\ndf = duckdb.sql("SELECT * FROM edited_dataset LIMIT 5").df()\n```"""
),
prev_button: gr.Button(interactive=False),
next_button: gr.Button(elem_classes="", interactive=True) if len(input_dataframe["value"]) >= PAGE_SIZE else gr.Button(interactive=False),
page_html: f"Page {page}",
}
@next_button.click(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, session_state, page_html], outputs=[dataframe, prev_button, next_button, page_html])
def _show_next_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> pd.DataFrame:
page = int(page_str.split(" ")[-1]) + 1
input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
return {
dataframe: gr.DataFrame(**input_dataframe),
prev_button: gr.Button(elem_classes="", interactive=True),
page_html: f"Page {page}",
}
@prev_button.click(inputs=[dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, session_state, page_html], outputs=[dataframe, prev_button, next_button, page_html])
def _show_prev_page(dataset: str, subset: str, split: str, loading_codes: list[dict], session: str, page_str: str) -> pd.DataFrame:
page = int(page_str.split(" ")[-1]) - 1
input_dataframe = show_input_dataframe(dataset, subset, split, loading_codes, session, page)
return {
dataframe: gr.DataFrame(**input_dataframe),
prev_button: gr.Button(interactive=False) if page == 1 else gr.Button(elem_classes="", interactive=True),
page_html: f"Page {page}",
}
@dataframe.input(inputs=[dataframe, session_state, dataset_dropdown, subset_dropdown, split_dropdown, loading_codes_json, page_html])
def _dataframe_input(df: pd.DataFrame, session: str | None, dataset: str, subset: str, split: str, loading_codes: list[dict], page_str: str):
pattern = ([loading_code["arguments"]["splits"][split] for loading_code in loading_codes if loading_code["config_name"] == subset] or [None])[0]
if session and dataset and subset and split and pattern:
duckdb_file = session + ".duckdb"
con = duckdb.connect(os.path.join(SESSIONS_DIR, duckdb_file))
setup_edits(con, dataset, pattern)
empty_dataset_tbl = con.sql("SELECT * EXCLUDE (rowid) FROM edits LIMIT 0;")
columns = empty_dataset_tbl.columns
dtypes = empty_dataset_tbl.dtypes
tbl = from_json_df(con, df, columns=columns, dtypes=dtypes)
page = int(page_str.split(" ")[-1])
# TODO add edits for page > 1
# Note: Here we don't use INSERT OR REPLACE because of Not implemented Error: List Update is not supported.
con.sql(f"DELETE FROM edits WHERE rowid IN range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})")
try:
con.sql(f"INSERT INTO edits SELECT * FROM (SELECT unnest(range({(page - 1) * PAGE_SIZE}, {page * PAGE_SIZE})) AS rowid) POSITIONAL JOIN tbl")
except duckdb.ConversionException as e:
raise gr.Error(str(e).split('\n')[0], title="duckdb.ConversionException")
print(f"Saved {dataset} edits")
if __name__ == "__main__":
if SPACE_ID:
os.makedirs(SESSION_DIR_SPACES_SYMLINK, exist_ok=True)
try:
os.symlink(SESSION_DIR_SPACES_SYMLINK, SESSIONS_DIR, target_is_directory=True)
except FileExistsError:
pass
else:
os.makedirs(SESSIONS_DIR, exist_ok=True)
gr.set_static_paths(paths=[SESSIONS_DIR])
demo.launch()