import logging

import gradio as gr
import pandas as pd
import polars as pl


logging.basicConfig(format='%(name)s - %(asctime)s - %(message)s', level=logging.INFO)

logging.info("loading data")
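# read all parquet shards of the precomputed statistics dataset directly from the Hub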
data = pl.read_parquet("hf://datasets/polinaeterna/hub_datasets_string_statistics/data/*.parquet")
logging.info("data loaded")


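# Global min/max of each statistic, used as slider bounds in the UI below.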
min_num_examples = data["num_examples"].min()
max_num_examples = data["num_examples"].max()

min_null_count = data["null_count"].min()
max_null_count = data["null_count"].max()

min_null_prop = data["null_proportion"].min()
max_null_prop = data["null_proportion"].max()

min_min = data["min"].min()
max_min = data["min"].max()

min_max = data["max"].min()
max_max = data["max"].max()

min_mean = data["mean"].min()
max_mean = data["mean"].max()

min_median = data["median"].min()
max_median = data["median"].max()

min_std = data["std"].min()
max_std = data["std"].max()


def urlize(dataset_name):
    """Format a dataset id as a markdown link to its page on the Hub."""
    return f"[{dataset_name}](https://huggingface.co/datasets/{dataset_name})"


def filter_data(
    min_num_examples_input, max_num_examples_input,
    min_null_count_input, max_null_count_input,
    min_null_prop_input, max_null_prop_input,
    min_min_input, max_min_input,
    min_max_input, max_max_input,
    min_mean_input, max_mean_input,
    min_median_input, max_median_input,
    min_std_input, max_std_input,
    sort_by,
    column_name,
    include_partial=False,
):
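    """Filter the statistics table by the given ranges; return a (table, summary) pair for display."""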

    # apply all numeric range filters in a single predicate
    df = data.filter(
        (pl.col("num_examples") >= min_num_examples_input) & (pl.col("num_examples") <= max_num_examples_input) &
        (pl.col("null_count") >= min_null_count_input) & (pl.col("null_count") <= max_null_count_input) &
        (pl.col("null_proportion") >= min_null_prop_input) & (pl.col("null_proportion") <= max_null_prop_input) &
        (pl.col("min") >= min_min_input) & (pl.col("min") <= max_min_input) &
        (pl.col("max") >= min_max_input) & (pl.col("max") <= max_max_input) &
        (pl.col("mean") >= min_mean_input) & (pl.col("mean") <= max_mean_input) &
        (pl.col("median") >= min_median_input) & (pl.col("median") <= max_median_input) &
        (pl.col("std") >= min_std_input) & (pl.col("std") <= max_std_input)
    )
    if not include_partial:
        df = df.filter(pl.col("partial") == False)  # keep only non-partial datasets
    if column_name:
        df = df.filter(pl.col("column_name") == column_name)
    if sort_by:
        try:
            sort_cols, sort_descs = parse_sort_by(sort_by)
        except ValueError:
            # malformed sort spec (e.g. a missing ":asc"/":desc" suffix)
            return pd.DataFrame(), "Incorrect sort string format."
        logging.info(sort_cols)
        logging.info(sort_descs)
        df = df.sort(
            *sort_cols,
            descending=sort_descs if len(sort_descs) > 1 else sort_descs[0],
        )
    n_rows = df.shape[0]
    n_splits = df.group_by(["dataset", "config", "split"]).len().shape[0]
    n_datasets = df["dataset"].n_unique()

    max_rows = 100

    text = f"{n_rows} rows / {n_splits} unique splits / {n_datasets} unique datasets found{f' (first {max_rows} rows displayed)' if n_rows > max_rows else ''}.\n"
    df = df.to_pandas()
    df["dataset"] = df["dataset"].apply(urlize)  # render dataset ids as markdown links
    df = df.drop("histogram", axis=1)  # histograms are not shown in the table
    logging.info(df.head(2))
    if n_rows > max_rows:
        return df.head(max_rows), text
    return df, text


def parse_sort_by(sort_string):
    """Parse a sort spec into column names and descending flags.

    >>> parse_sort_by("num_examples:desc;std:asc")
    (['num_examples', 'std'], [True, False])
    """
    col_names, descs = [], []
    for arg in sort_string.split(";"):
        col_name, desc = arg.split(":")
        col_names.append(col_name)
        descs.append(desc == "desc")
    return col_names, descs


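# Build the UI: one min/max slider pair per statistic, plus sort, column-name and "partial" controls.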
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # 💫 Filter text datasets by string statistics 💫
        
        ### The raw data is here:
        """)

    html_code = """
    <iframe
      src="https://huggingface.co/datasets/polinaeterna/hub_datasets_string_statistics/embed/viewer/default/train"
      frameborder="0"
      width="100%"
      height="560px"
    ></iframe>
    """
    gr.HTML(value=html_code)

    gr.Markdown("- Number of examples range")
    with gr.Row():
        with gr.Column():
            min_num_examples_input = gr.Slider(min_num_examples, max_num_examples, min_num_examples, step=1, label="Min num examples value")
        with gr.Column():
            max_num_examples_input = gr.Slider(min_num_examples, max_num_examples, max_num_examples, step=1, label="Max num examples value")

    gr.Markdown("- Null count range")
    with gr.Row():
        with gr.Column():
            min_null_count_input = gr.Slider(min_null_count, max_null_count, min_null_count, step=1, label="Min null count value")
        with gr.Column():
            max_null_count_input = gr.Slider(min_null_count, max_null_count, max_null_count, step=1, label="Max null count value")

    gr.Markdown("- Null proportion range")
    with gr.Row():
        with gr.Column():
            min_null_prop_input = gr.Slider(min_null_prop, max_null_prop, min_null_prop, step=0.01, label="Min null proportion value")
        with gr.Column():
            max_null_prop_input = gr.Slider(min_null_prop, max_null_prop, max_null_prop, step=0.01, label="Max null proportion value")

    gr.Markdown("- Minimum string length (in characters) range")
    with gr.Row():
        with gr.Column():
            min_min_input = gr.Slider(min_min, max_min, min_min, step=1, label="Min min value")
        with gr.Column():
            max_min_input = gr.Slider(min_min, max_min, max_min, step=1, label="Max min value")

    gr.Markdown("- Maximum string length (in characters) range")
    with gr.Row():
        with gr.Column():
            min_max_input = gr.Slider(min_max, max_max, min_max, step=1, label="Min max value")
        with gr.Column():
            max_max_input = gr.Slider(min_max, max_max, max_max, step=1, label="Max max value")

    gr.Markdown("- Mean string length (in characters) range")
    with gr.Row():
        with gr.Column():
            min_mean_input = gr.Slider(min_mean, max_mean, min_mean, step=1, label="Min mean value")
        with gr.Column():
            max_mean_input = gr.Slider(min_mean, max_mean, max_mean, step=1, label="Max mean value")

    gr.Markdown("- Median string length (in characters) range")
    with gr.Row():
        with gr.Column():
            min_median_input = gr.Slider(min_median, max_median, min_median, step=1, label="Min median value")
        with gr.Column():
            max_median_input = gr.Slider(min_median, max_median, max_median, step=1, label="Max median value")

    gr.Markdown("- Standard deviation of string length (in characters) range")
    with gr.Row():
        with gr.Column():
            min_std_input = gr.Slider(min_std, max_std, min_std, step=1, label="Min std value")
        with gr.Column():
            max_std_input = gr.Slider(min_std, max_std, max_std, step=1, label="Max std value")

    sort_by = gr.Textbox(placeholder="num_examples:desc;std:asc;null_proportion:asc", label="Sort by (optional), in the following format: '<column_name_1>:desc/asc;<column_name_2>:desc/asc'")
    column_name = gr.Textbox(placeholder="text", label="Column name, if you want to check only a specific column (optional)")
    include_partial = gr.Checkbox(False, label="Include partial datasets")
    btn = gr.Button("Get datasets")
    summary = gr.Markdown()
    datasets = gr.DataFrame(datatype="markdown")
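    # run the query on click; the inputs are passed positionally, matching filter_data's signature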
    btn.click(filter_data, inputs=[
        min_num_examples_input, max_num_examples_input,
        min_null_count_input, max_null_count_input,
        min_null_prop_input, max_null_prop_input,
        min_min_input, max_min_input,
        min_max_input, max_max_input,
        min_mean_input, max_mean_input,
        min_median_input, max_median_input,
        min_std_input, max_std_input,
        sort_by,
        column_name,
        include_partial,
    ], outputs=[datasets, summary])

if __name__ == "__main__":
    demo.launch(debug=True)