File size: 3,475 Bytes
1ccdd5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98f0f12
 
 
1ccdd5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c7943d0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import sys
tabpfn_path = 'TabPFN'
sys.path.insert(0, tabpfn_path) # our submodule of the TabPFN repo (at 045c8400203ebd062346970b4f2c0ccda5a40618)
from TabPFN.scripts.transformer_prediction_interface import TabPFNClassifier

import numpy as np
import pandas as pd
import torch
import gradio as gr
import openml


def _is_blank_cell(cell) -> bool:
    """Return True when a table cell holds no value (``None`` or ``''``)."""
    return cell is None or (isinstance(cell, str) and cell == '')


def compute(table: np.ndarray):
    """Predict the blank cells of one column from the rows that are filled in.

    Rows in which every cell is blank are dropped first. All remaining blank
    cells must lie in one column, which is then used as the label column:
    rows that have a label are the training data, rows without one are
    predicted with a ``TabPFNClassifier``.

    Args:
        table: 2-D array of cell values. A cell is blank when it is ``None``
            or the empty string; every other value (numbers included) counts
            as filled.

    Returns:
        A ``(message, out_table)`` pair. When the input cannot be used,
        ``message`` is a Markdown error string and ``out_table`` is ``None``.
        On success ``message`` is ``None`` and ``out_table`` is a string copy
        of the filtered table in which each blank label has been replaced by
        ``"<prediction> (p=<probability>)"``.
    """
    table = np.asarray(table)
    if table.size == 0:
        # A table without cells has no blank cell to predict either.
        return "**Please leave at least one field blank for prediction.**", None

    # Test cell by cell instead of calling len(): cells may be numbers or
    # None, which have no length.
    blank = np.frompyfunc(_is_blank_cell, 1, 1)(table).astype(bool)
    keep_rows = ~blank.all(axis=1)
    table, blank = table[keep_rows], blank[keep_rows]

    blank_rows, blank_cols = np.nonzero(blank)
    if blank_rows.size == 0:
        return "**Please leave at least one field blank for prediction.**", None
    if np.any(blank_cols != blank_cols[0]):
        return "**Please only leave fields of one column blank for prediction.**", None
    y_column = blank_cols[0]
    eval_lines = blank_rows

    train_table = np.delete(table, eval_lines, axis=0)
    eval_table = table[eval_lines]
    if train_table.shape[0] == 0:
        # Every remaining row is missing its label, so there is nothing to fit on.
        return "**Please fill in the blank column for at least one row to provide training data.**", None

    try:
        x_train = torch.tensor(np.delete(train_table, y_column, axis=1).astype(np.float32))
        x_eval = torch.tensor(np.delete(eval_table, y_column, axis=1).astype(np.float32))
    except (ValueError, TypeError):
        # ValueError: non-numeric text; TypeError: a cell object float() cannot convert.
        return "**Please only add numbers (to the inputs) or leave fields empty.**", None
    y_train = train_table[:, y_column]

    classifier = TabPFNClassifier(base_path=tabpfn_path, device='cpu')
    classifier.fit(x_train, y_train)
    y_eval, p_eval = classifier.predict(x_eval, return_winning_probability=True)

    # Object dtype on purpose: a fixed-width string array would silently cut
    # the longer "label (p=0.97)" strings down to the widest input cell.
    out_table = table.astype(str).astype(object)
    out_table[eval_lines, y_column] = [f"{y_e} (p={p_e:.2f})" for y_e, p_e in zip(y_eval, p_eval)]
    return None, out_table


def upload_file(file):
    """Load an uploaded dataset file into a DataFrame for the input table.

    Args:
        file: Object exposing the path of the uploaded file as ``file.name``,
            or ``None`` when no file is selected (e.g. the upload was cleared).

    Returns:
        A ``pandas.DataFrame`` for supported files, otherwise ``None``:

        * ``.arff`` - read through ``openml``; columns carry the attribute
          names returned by ``dataset.get_data``.
        * ``.csv`` / ``.data`` - read without a header row; columns are
          numbered from 0.
        * no file or any other extension - ``None``.
    """
    if file is None:
        # Without this guard, ``file.name`` raises AttributeError.
        return None

    path = file.name
    # Match extensions case-insensitively so "DATA.CSV" is accepted as well.
    lowered = path.lower()

    if lowered.endswith('.arff'):
        dataset = openml.datasets.OpenMLDataset('t', 'test', data_file=path)
        X_, _, _, attribute_names_ = dataset.get_data(
            dataset_format="array"
        )
        return pd.DataFrame(X_, columns=attribute_names_)

    if lowered.endswith(('.csv', '.data')):
        df = pd.read_csv(path, header=None)
        df.columns = np.arange(len(df.columns))
        return df

    return None


# Toy table shown when the demo starts. The blank cell in the last row is
# the one `compute` will be asked to fill in.
example = [
    # rows that already carry a value in the last column
    [1, 2, 1], [2, 1, 1], [1, 1, 1],
    [2, 2, 2], [3, 4, 2], [3, 2, 2],
    # row whose last cell is left empty
    [2, 3, ''],
]

# Build the Gradio page. Components are declared in this order: intro text,
# input table, file upload, example files, predict button, status text and
# result table.
with gr.Blocks() as demo:
    gr.Markdown("""This demo allows you to play with the **TabPFN**.
The TabPFN will predict the values for all empty cells in one column.
Please, provide everything but the label column as numeric values. It is ok to encode classes as integers.
You can also upload datasets to fill the table automatically.
    """)
    # Input table, pre-filled with `example` and three unnamed column headers.
    # NOTE(review): type='numpy' presumably makes Gradio hand the table to
    # callbacks as a numpy array, which is what `compute` indexes - confirm.
    inp_table = gr.DataFrame(type='numpy', value=example, headers=[''] * 3)
    inp_file = gr.File(
        label='Drop either a .csv (without header, only numeric values for all but the labels) or a .arff file.')
    # Bundled example files, loaded into the input table through `upload_file`.
    # NOTE(review): the paths are relative, so the files are presumably looked
    # up from the working directory; cache_examples=True presumably runs
    # `upload_file` on them ahead of time - confirm against the Gradio version.
    examples = gr.Examples(examples=['iris.csv', 'balance-scale.arff'],
                           inputs=[inp_file],
                           outputs=[inp_table],
                           fn=upload_file,
                           cache_examples=True)
    btn = gr.Button("Predict Empty Table Cells")

    # When the file input changes, replace the input table with the file's contents.
    inp_file.change(fn=upload_file, inputs=inp_file, outputs=inp_table)

    # Targets for the two values `compute` returns: a Markdown message and a table.
    out_text = gr.Markdown()
    out_table = gr.DataFrame()

    # `compute` returns (message, table), matching [out_text, out_table] in order.
    btn.click(fn=compute, inputs=inp_table, outputs=[out_text, out_table])

# Runs at module level (there is no __main__ guard), so importing this module
# also starts the app.
demo.launch()