awacke1's picture
Rename app.py to backup2.app.py
064f8b4 verified
raw
history blame
5.76 kB
from typing import List, Dict
import httpx
import gradio as gr
import pandas as pd
import json
async def get_splits(dataset_name: str) -> Dict[str, List[Dict]]:
    """Fetch the config/split pairs listed for a Hugging Face dataset.

    Requests ``https://huggingface.co/api/datasets/{dataset_name}`` and
    flattens the ``"config"`` mapping of the JSON reply into one entry per
    (config, split) pair.

    Args:
        dataset_name: Dataset identifier interpolated into the request URL.

    Returns:
        ``{"splits": [...]}`` where each entry has the keys ``"split"``,
        ``"config"`` and ``"dataset"``.  The list is empty when the reply
        has no ``"config"`` mapping.

    Note:
        Whatever ``response.json()`` raises on a malformed body propagates to
        the caller; ``update_configs`` handles ``json.JSONDecodeError``.
    """
    URL = f"https://huggingface.co/api/datasets/{dataset_name}"
    async with httpx.AsyncClient() as session:
        response = await session.get(URL)
        dataset_info = response.json()

    # NOTE(review): the "config" -> {"splits": [...]} layout is taken as-is
    # from the existing code; confirm it matches the actual API reply.
    configs = dataset_info.get("config", {})
    return {
        "splits": [
            {
                "split": split_name,
                "config": config_name,
                # update_splits reads entry["dataset"] to know which dataset
                # to load, so every entry must carry the dataset name.
                "dataset": dataset_name,
            }
            for config_name, config_info in configs.items()
            for split_name in config_info.get("splits", [])
        ]
    }
async def get_valid_datasets() -> List[str]:
    """Return the namespaced dataset ids listed by the Hugging Face API.

    Requests ``https://huggingface.co/api/datasets`` and keeps every entry
    whose identifier contains a ``/``.  Entries may be plain strings or dicts
    carrying the identifier under ``"id"``; anything else is skipped.

    Returns:
        The collected identifiers, or an empty list when decoding the reply
        raises ``json.JSONDecodeError`` (or a ``KeyError`` occurs).
    """
    URL = "https://huggingface.co/api/datasets"
    async with httpx.AsyncClient() as session:
        response = await session.get(URL)

    try:
        names = []
        for entry in response.json():
            if isinstance(entry, str):
                candidate = entry
            elif isinstance(entry, dict):
                candidate = entry.get("id", "")
            else:
                continue
            if "/" in candidate:
                names.append(candidate)
    except (KeyError, json.JSONDecodeError):
        # Fall back to an empty list if the reply is not in the expected format.
        names = []
    return names
async def get_first_rows(dataset: str, config: str, split: str) -> Dict[str, Dict[str, List[Dict]]]:
    """Download ``dataset_info.json`` for *dataset* and return up to ten examples.

    Args:
        dataset: Dataset identifier interpolated into the download URL.
        config: Accepted for signature symmetry; not used by this function.
        split: Key looked up under ``"splits"`` in the downloaded JSON.

    Returns:
        ``{"rows": [{"row": example}, ...]}`` built from the first ten items
        of ``info["splits"][split]["examples"]``.

    Raises:
        KeyError: if ``"splits"``, *split* or ``"examples"`` is missing.
    """
    URL = f"https://huggingface.co/datasets/{dataset}/resolve/main/dataset_info.json"
    async with httpx.AsyncClient() as session:
        response = await session.get(URL)
        info = response.json()

    examples = info["splits"][split]["examples"]
    wrapped = []
    for example in examples[:10]:
        wrapped.append({"row": example})
    return {"rows": wrapped}
def get_df_from_rows(api_output):
    """Build a DataFrame from a ``{"rows": [{"row": ...}, ...]}`` payload.

    Args:
        api_output: Mapping with a ``"rows"`` list; each item holds the record
            to tabulate under its ``"row"`` key.

    Returns:
        A ``pandas.DataFrame`` with one row per record, in input order.

    Raises:
        KeyError: if ``"rows"`` or a ``"row"`` key is missing.
    """
    frame = pd.DataFrame([entry["row"] for entry in api_output["rows"]])
    try:
        # NOTE(review): sort_values returns a new frame that is discarded here,
        # so this never reorders the result; it only triggers the diagnostic
        # below when the sort is not possible.  Confirm whether the sorted
        # frame was meant to be returned.
        frame.sort_values(by=1, axis=1, kind="mergesort")
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        print("Exception sorting due to keyerror?")
    return frame
async def update_configs(dataset_name: str):
    """Look up the configs of *dataset_name* for the "Subset" dropdown.

    Args:
        dataset_name: Dataset identifier forwarded to ``get_splits``.

    Returns:
        A 2-tuple ``((configs, first_config_or_None), splits)`` where
        ``configs`` is the sorted list of distinct config names and ``splits``
        is the raw ``get_splits`` result.  When the reply cannot be decoded
        as JSON the result is ``(([], None), {"splits": []})``.
    """
    try:
        splits = await get_splits(dataset_name)
    except json.JSONDecodeError:
        # Same two-element shape as the success path; the previous
        # three-element fallback did not match it.
        return ([], None), {"splits": []}

    all_configs = sorted({entry["config"] for entry in splits["splits"]})
    first_config = all_configs[0] if all_configs else None
    # NOTE(review): the dropdown receives a plain (choices, value) tuple here;
    # confirm that the installed Gradio version accepts that form.
    return (all_configs, first_config), splits
async def update_splits(config_name: str, state: gr.State):
    """List the splits of *config_name* and load a preview of the first one.

    Args:
        config_name: Config whose splits should be listed.
        state: Value held by the splits state component, expected to be the
            ``{"splits": [...]}`` mapping produced by ``get_splits``.  It may
            be ``None`` before any dataset has been selected.

    Returns:
        ``(splits, first_split_or_None, dataset)`` where ``dataset`` is the
        preview DataFrame, or a no-op ``gr.DataFrame.update()`` when there is
        nothing to load.
    """
    # Tolerate an unset state and a missing "splits" key instead of raising.
    entries = state.get("splits") if state else None
    if not entries:
        return [], None, gr.DataFrame.update()

    splits_for_config = sorted(
        {entry["split"] for entry in entries if entry["config"] == config_name}
    )
    if not splits_for_config:
        return [], None, gr.DataFrame.update()

    # Entries without a "dataset" key skip the preview rather than raising
    # KeyError.
    dataset_name = entries[0].get("dataset")
    if dataset_name:
        dataset = await update_dataset(splits_for_config[0], config_name, dataset_name)
    else:
        dataset = gr.DataFrame.update()
    return splits_for_config, splits_for_config[0], dataset
async def update_dataset(split_name: str, config_name: str, dataset_name: str):
    """Fetch the first rows of a dataset split and return them as a DataFrame.

    The arguments are forwarded to ``get_first_rows`` in
    (dataset, config, split) order, and its result is converted with
    ``get_df_from_rows``.
    """
    first_rows = await get_first_rows(dataset_name, config_name, split_name)
    return get_df_from_rows(first_rows)
async def update_URL(dataset: str, config: str, split: str) -> str:
    """Compose the Hugging Face "tree/main" URL for a dataset, config and split."""
    return f"https://huggingface.co/datasets/{dataset}/tree/main/{config}/{split}"
async def openurl(URL: str) -> str:
    """Render *URL* as an HTML anchor that opens in a new tab.

    The value comes from a user-editable textbox, so it is HTML-escaped and
    placed inside quoted attributes; unescaped input could otherwise inject
    extra attributes or markup into the page.

    Args:
        URL: Address to link to; also used as the visible link text.

    Returns:
        An ``<a>`` element string with the escaped URL as both ``href`` and
        link text.
    """
    # Local import: the file does not import ``html`` at module level.
    from html import escape

    safe_url = escape(str(URL), quote=True)
    return f'<a href="{safe_url}" target="_blank" rel="noopener noreferrer">{safe_url}</a>'
# UI definition: three dropdowns (dataset / subset / split), a URL row with a
# button that renders the URL as a link, and a DataFrame preview.
with gr.Blocks() as demo:
    gr.Markdown("<h1><center>🥫Datasetter📊 Datasets Analyzer and Transformer</center></h1>")
    gr.Markdown("""<div align="center">Curated Datasets: <a href = "https://www.kaggle.com/datasets">Kaggle</a>. <a href="https://www.nlm.nih.gov/research/umls/index.html">NLM UMLS</a>. <a href="https://loinc.org/downloads/">LOINC</a>. <a href="https://www.cms.gov/medicare/icd-10/2022-icd-10-cm">ICD10 Diagnosis</a>. <a href="https://icd.who.int/dev11/downloads">ICD11</a>. <a href="https://paperswithcode.com/datasets?q=medical&v=lst&o=newest">Papers,Code,Datasets for SOTA in Medicine</a>. <a href="https://paperswithcode.com/datasets?q=mental&v=lst&o=newest">Mental</a>. <a href="https://paperswithcode.com/datasets?q=behavior&v=lst&o=newest">Behavior</a>. <a href="https://www.cms.gov/medicare-coverage-database/downloads/downloads.aspx">CMS Downloads</a>. <a href="https://www.cms.gov/medicare/fraud-and-abuse/physicianselfreferral/list_of_codes">CMS CPT and HCPCS Procedures and Services</a> """)

    # Holds the {"splits": [...]} mapping between the config and split callbacks.
    splits_data = gr.State()

    with gr.Row():
        dataset_name = gr.Dropdown(label="Dataset", interactive=True, allow_custom_value=True)
        config = gr.Dropdown(label="Subset", interactive=True, allow_custom_value=True)
        split = gr.Dropdown(label="Split", interactive=True, allow_custom_value=True)

    with gr.Row():
        URLcenter = gr.Textbox(label="Dataset URL", placeholder="URL")
        btn = gr.Button("Use Dataset")
        URLoutput = gr.HTML(label="Output")

    with gr.Row():
        dataset = gr.DataFrame(wrap=True, interactive=True)

    demo.load(get_valid_datasets, inputs=None, outputs=[dataset_name])
    dataset_name.change(update_configs, inputs=[dataset_name], outputs=[config, splits_data])
    # NOTE(review): update_splits returns three values while only two outputs
    # are wired here; confirm the intended mapping.
    config.change(update_splits, inputs=[config, splits_data], outputs=[split, dataset])
    split.change(update_dataset, inputs=[split, config, dataset_name], outputs=[dataset])
    # update_URL takes (dataset, config, split); the inputs were previously
    # passed as (split, config, dataset_name), swapping dataset and split in
    # the generated URL.
    dataset_name.change(update_URL, inputs=[dataset_name, config, split], outputs=[URLcenter])
    btn.click(openurl, [URLcenter], URLoutput)

demo.launch(debug=True)