File size: 3,305 Bytes
5faa10b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import sys
tabpfn_path = 'TabPFN'
sys.path.insert(0, tabpfn_path) # our submodule of the TabPFN repo (at 045c8400203ebd062346970b4f2c0ccda5a40618)
from TabPFN.scripts.transformer_prediction_interface import TabPFNClassifier

import numpy as np
import pandas as pd
import torch
import gradio as gr
import openml
from sklearn.model_selection import cross_val_score


def compute(file, y_attribute, cv_folds):
    """Cross-validate TabPFN on an uploaded .arff file and report ROC AUC (OVO).

    Args:
        file: Uploaded file object exposing the on-disk path as ``file.name``,
            or ``None`` when nothing has been uploaded.
        y_attribute: Name of the attribute to use as the prediction target.
        cv_folds: Number of cross-validation folds (an integer).

    Returns:
        A ``(message, y_attribute)`` tuple. ``message`` is either a hint on
        what the user has to fix (no/unsupported file, unknown attribute) or
        the cross-validation result; ``y_attribute`` is passed through
        unchanged so the textbox keeps its value.
    """
    if file is None or not file.name.endswith('.arff'):
        return 'Please upload a .arff file', y_attribute

    dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=file.name)

    # Load once without a target purely to learn the attribute names, so an
    # unknown target can be reported instead of failing inside get_data.
    _, _, _, attribute_names = dataset.get_data(dataset_format="array")
    if y_attribute not in attribute_names:
        return f"**Select attribute from {', '.join(attribute_names)}**", y_attribute
    X, y, _, _ = dataset.get_data(dataset_format="array", target=y_attribute)

    # Shuffle rows with a fixed seed so the folds are reproducible.
    order = np.arange(y.shape[0])
    np.random.seed(13)
    np.random.shuffle(order)
    X, y = torch.tensor(X[order]), torch.tensor(y[order])

    classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')
    scores = cross_val_score(classifier, X, y, cv=cv_folds, scoring='roc_auc_ovo')
    print(scores)

    # The 1024-example limit applies to what the model is fitted on, i.e. all
    # rows except the held-out fold -- not the size of the held-out fold.
    n_samples = len(y)
    max_train_size = n_samples - n_samples // cv_folds
    if max_train_size > 1024:
        note = ("The PFN is only trained for datasets with up to 1024 training examples "
                "and it had to extrapolate to greater datasets for this evaluation.")
    else:
        note = ""

    message = f"ROC AUC OVO Cross Val mean is {sum(scores) / len(scores)} from {scores}. " + note
    return message, y_attribute


def upload_file(file):
    """Inspect a freshly uploaded file and suggest a target attribute.

    Args:
        file: Uploaded file object exposing the on-disk path as ``file.name``,
            or ``None`` when the file input is empty.

    Returns:
        A ``(message, default_attribute)`` tuple, one value per wired output.
        For a ``.arff`` file the message lists the available attribute names
        and the last attribute is proposed as the target; otherwise the
        message asks for a ``.arff`` file and the attribute is ``None``.
    """
    # Always return two values: the handler feeds two output components, so a
    # bare ``return`` for the empty case would hand back too few outputs.
    if file is None or not file.name.endswith('.arff'):
        return 'Please upload a .arff file', None

    dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=file.name)
    _, _, _, attribute_names = dataset.get_data(dataset_format="array")
    return f"Select attribute from {', '.join(attribute_names)}", attribute_names[-1]


# Build the Gradio UI. Components are created in the order they should appear,
# so the statement order inside this block is significant.
with gr.Blocks() as demo:
    gr.Markdown("""This demo allows you to play with the **TabPFN**.
    Upload a .arff file, select an attribute to predict and the number of cross validation folds and get the ROC AUC OVO score for one seed.
    """)
    # Input: the dataset file to evaluate.
    inp_file = gr.File(
        label='Drop a .arff file.')
    # Input: number of folds passed to compute() as cv_folds.
    cv_folds = gr.Dropdown([2, 3, 4, 5], value=2, label='Number of CV folds')
    # Output: status / result message written by both upload_file and compute.
    out_text = gr.Markdown()

    # Target attribute name; filled in by upload_file and editable by the user.
    y_attribute = gr.Textbox(label='y attribute')

    # Bundled example file; selecting it runs upload_file like a manual upload.
    # NOTE(review): cache_examples=True presumably makes Gradio precompute the
    # example outputs, which requires 'balance-scale.arff' to exist -- confirm.
    examples = gr.Examples(examples=['balance-scale.arff'],
                           inputs=[inp_file],
                           outputs=[out_text, y_attribute],
                           fn=upload_file,
                           cache_examples=True)
    # NOTE(review): the label mentions predicting table cells, but the click
    # handler below runs cross-validation via compute() -- confirm wording.
    btn = gr.Button("Predict Empty Table Cells")
    # out_table = gr.DataFrame()
    # Whenever the file input changes, refresh the hint text and default target.
    inp_file.change(fn=upload_file, inputs=inp_file, outputs=[out_text, y_attribute])

    # Run the evaluation; y_attribute is both an input and an output so its
    # current value is written back unchanged.
    btn.click(fn=compute, inputs=[inp_file, y_attribute, cv_folds], outputs=[out_text, y_attribute])

# Start the web server for the demo (executed at import time of this script).
demo.launch()