File size: 5,757 Bytes
a66cfba
 
 
 
 
 
 
2b0a4af
a66cfba
 
2b0a4af
 
 
 
 
 
 
 
 
4e37f97
 
 
 
 
b8d2d87
 
 
 
 
 
 
4e37f97
 
 
 
2b0a4af
 
 
 
 
 
 
 
 
 
 
 
 
a66cfba
 
 
 
 
 
 
 
 
4e37f97
 
 
5d05809
4e37f97
5d05809
a66cfba
 
f21399e
 
 
a66cfba
4e37f97
 
5d05809
a66cfba
 
 
 
 
 
4e37f97
 
 
a66cfba
 
 
4e37f97
a66cfba
 
 
 
 
 
 
 
4e37f97
f21399e
 
a66cfba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import html
import json
from typing import Dict, List

import gradio as gr
import httpx
import pandas as pd

async def get_splits(dataset_name: str) -> Dict[str, List[Dict]]:
    """Fetch metadata for *dataset_name* from the Hugging Face hub API and
    flatten it into {"splits": [{"split": ..., "config": ...}, ...]}.

    Missing "config"/"splits" keys in the response yield an empty list.
    """
    url = f"https://huggingface.co/api/datasets/{dataset_name}"
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
        info = response.json()
        entries: List[Dict] = []
        for cfg_name, cfg_info in info.get("config", {}).items():
            for split_name in cfg_info.get("splits", []):
                entries.append({"split": split_name, "config": cfg_name})
        return {"splits": entries}

async def get_valid_datasets() -> List[str]:
    """Return ids of all namespaced ("owner/name") datasets listed by the
    Hugging Face hub API, or an empty list if the response is malformed.
    """
    # Fix: plain string — the original used an f-string with no placeholders.
    URL = "https://huggingface.co/api/datasets"
    async with httpx.AsyncClient() as session:
        response = await session.get(URL)
        try:
            datasets_raw = response.json()
            datasets = []
            for dataset_raw in datasets_raw:
                # The API has returned either plain id strings or dicts with
                # an "id" field — accept both shapes.
                if isinstance(dataset_raw, str) and "/" in dataset_raw:
                    datasets.append(dataset_raw)
                elif isinstance(dataset_raw, dict) and "/" in dataset_raw.get("id", ""):
                    datasets.append(dataset_raw["id"])
        # Fix: KeyError was unreachable (only .get / guarded access above),
        # while iterating a non-list payload raised an uncaught TypeError.
        except (json.JSONDecodeError, TypeError):
            datasets = []  # Set a default value if the response is not in the expected format
        return datasets

async def get_first_rows(dataset: str, config: str, split: str) -> Dict[str, Dict[str, List[Dict]]]:
    """Fetch the dataset's dataset_info.json and return up to the first 10
    example rows of *split* as {"rows": [{"row": ...}, ...]}.

    NOTE: *config* is accepted for signature compatibility but not used in
    the URL — presumably the info file is config-agnostic; verify upstream.
    """
    url = f"https://huggingface.co/datasets/{dataset}/resolve/main/dataset_info.json"
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
        info = response.json()
        examples = info["splits"][split]["examples"]
        return {"rows": [{"row": example} for example in examples[:10]]}

def get_df_from_rows(api_output):
    """Convert the {"rows": [{"row": {...}}, ...]} payload into a DataFrame.

    Columns are ordered by the values in row label 1 (stable mergesort,
    NaNs last) when that row exists; otherwise the frame is returned as
    built.
    """
    df = pd.DataFrame([row["row"] for row in api_output["rows"]])
    try:
        # Fix: the original discarded the sorted result (inplace=False with
        # no assignment), so the sort had no effect — assign it back.
        df = df.sort_values(by=1, axis=1, ascending=True, kind="mergesort", na_position="last")
    # Fix: bare `except` hid every failure; catch only what sorting can
    # realistically raise (row label 1 absent, or unorderable row values).
    except (KeyError, IndexError, TypeError):
        pass  # fewer than two rows (or unsortable values): keep unsorted
    return df

async def update_configs(dataset_name: str):
    """Populate the config dropdown and the splits state for *dataset_name*.

    Returns exactly two values — (dropdown_update, splits_payload) —
    matching the two output components wired to dataset_name.change.
    """
    try:
        splits = await get_splits(dataset_name)
        all_configs = sorted({s["config"] for s in splits["splits"]})
        # Fix: a Dropdown output needs gr.Dropdown.update(choices=..., value=...);
        # the original returned a raw (choices, value) tuple, which gradio
        # treats as the dropdown's *value*, never refreshing the choices.
        return (
            gr.Dropdown.update(
                choices=all_configs,
                value=all_configs[0] if all_configs else None,
            ),
            splits,
        )
    except (json.JSONDecodeError, KeyError):
        # Fix: the original error path returned THREE values for the TWO
        # wired outputs; return a matching pair instead.
        return gr.Dropdown.update(choices=[], value=None), {"splits": []}

async def update_splits(config_name: str, state: gr.State):
    """Refresh the split dropdown (and preload the table when possible)
    after the user picks a config.

    Returns (split_choices, selected_split, dataframe_or_noop_update).
    """
    # Fix: state is None until the first dataset is chosen — the original
    # `state["splits"]` then raised TypeError. Guard both None and empty.
    if not state or not state.get("splits"):
        return [], None, gr.DataFrame.update()

    splits_for_config = sorted({s["split"] for s in state["splits"] if s["config"] == config_name})
    # Fix: split entries only carry "split"/"config" keys (see get_splits),
    # so the original ["dataset"] lookup raised KeyError. Use .get so the
    # table preload is skipped gracefully when no dataset name is present.
    dataset_name = state["splits"][0].get("dataset")
    if splits_for_config and dataset_name:
        dataset = await update_dataset(splits_for_config[0], config_name, dataset_name)
    else:
        dataset = gr.DataFrame.update()
    return splits_for_config, splits_for_config[0] if splits_for_config else None, dataset

async def update_dataset(split_name: str, config_name: str, dataset_name: str):
    """Load the first rows of the chosen split and render them as a DataFrame."""
    api_output = await get_first_rows(dataset_name, config_name, split_name)
    return get_df_from_rows(api_output)

async def update_URL(dataset: str, config: str, split: str) -> str:
    """Build the Hub file-tree URL for the selected dataset/config/split."""
    path = f"{dataset}/tree/main/{config}/{split}"
    return f"https://huggingface.co/datasets/{path}"
   
async def openurl(URL: str) -> str:
    """Render *URL* as an HTML anchor that opens in a new browser tab.

    Fix: the original emitted unquoted attribute values (href={URL}),
    which is invalid HTML and breaks for URLs containing spaces, quotes
    or angle brackets — escape the URL and quote the attributes.
    """
    safe = html.escape(URL, quote=True)
    return f'<a href="{safe}" target="_blank">{safe}</a>'

# ---------------------------------------------------------------------------
# Gradio UI: pick a dataset -> config -> split, preview the first rows in a
# table, and build a link to the dataset's file tree on the Hub.
# ---------------------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("<h1><center>🥫Datasetter📊 Datasets Analyzer and Transformer</center></h1>")
    # Static banner of curated external dataset/terminology resources.
    gr.Markdown("""<div align="center">Curated Datasets: <a href = "https://www.kaggle.com/datasets">Kaggle</a>. <a href="https://www.nlm.nih.gov/research/umls/index.html">NLM UMLS</a>.  <a href="https://loinc.org/downloads/">LOINC</a>. <a href="https://www.cms.gov/medicare/icd-10/2022-icd-10-cm">ICD10 Diagnosis</a>. <a href="https://icd.who.int/dev11/downloads">ICD11</a>.  <a href="https://paperswithcode.com/datasets?q=medical&v=lst&o=newest">Papers,Code,Datasets for SOTA in Medicine</a>.   <a href="https://paperswithcode.com/datasets?q=mental&v=lst&o=newest">Mental</a>.  <a href="https://paperswithcode.com/datasets?q=behavior&v=lst&o=newest">Behavior</a>. <a href="https://www.cms.gov/medicare-coverage-database/downloads/downloads.aspx">CMS Downloads</a>.  <a href="https://www.cms.gov/medicare/fraud-and-abuse/physicianselfreferral/list_of_codes">CMS CPT and HCPCS Procedures and Services</a>  """)

    # Cross-event cache of the {"splits": [...]} payload from get_splits, so
    # config changes can reuse it without refetching dataset metadata.
    splits_data = gr.State()

    with gr.Row():
        # allow_custom_value lets users type ids not present in the choices.
        dataset_name = gr.Dropdown(label="Dataset", interactive=True, allow_custom_value=True)
        config = gr.Dropdown(label="Subset", interactive=True, allow_custom_value=True)
        split = gr.Dropdown(label="Split", interactive=True, allow_custom_value=True)

    with gr.Row():
        URLcenter = gr.Textbox(label="Dataset URL", placeholder="URL")
        btn = gr.Button("Use Dataset")
        URLoutput = gr.HTML(label="Output")

    with gr.Row():
        # Preview table filled by update_dataset / update_splits.
        dataset = gr.DataFrame(wrap=True, interactive=True)

    # Populate the dataset dropdown once at page load.
    demo.load(get_valid_datasets, inputs=None, outputs=[dataset_name])

    # Cascade: dataset change -> configs; config change -> splits (+ preview);
    # split change -> preview table.
    dataset_name.change(update_configs, inputs=[dataset_name], outputs=[config, splits_data])
    config.change(update_splits, inputs=[config, splits_data], outputs=[split, dataset])
    split.change(update_dataset, inputs=[split, config, dataset_name], outputs=[dataset])

    # NOTE(review): fires on dataset change but reads split/config, which may
    # still hold stale values from the previous dataset — confirm intent.
    dataset_name.change(update_URL, inputs=[split, config, dataset_name], outputs=[URLcenter])

    # Button renders the textbox URL as a clickable link via openurl.
    btn.click(openurl, [URLcenter], URLoutput)

# debug=True surfaces server errors/tracebacks in the console and browser.
demo.launch(debug=True)