import gradio as gr
import pandas as pd
from datasets import Dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)


def preprocess_data(file):
    """Load an uploaded CSV and return ``(texts, labels)`` as pandas Series.

    Expects a ``"target_column"`` label column; the first remaining column is
    used as the text input. BUG FIX: the original returned the whole feature
    DataFrame, and ``list(DataFrame)`` yields column *names*, so the tokenizer
    would have encoded the CSV header instead of the row values.
    """
    df = pd.read_csv(file.name)
    y = df["target_column"]
    X = df.drop(columns=["target_column"]).iloc[:, 0].astype(str)
    return X, y


def load_model(model_name="bert-base-uncased"):
    """Download a pretrained checkpoint and its tokenizer for 2-label classification."""
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def setup_training_args(learning_rate=5e-5, epochs=3, batch_size=8):
    """Build the HuggingFace ``TrainingArguments`` for one fine-tuning run.

    BUG FIX: the original requested ``evaluation_strategy="epoch"``, but
    ``train_model`` never supplies an ``eval_dataset``, which makes ``Trainer``
    raise at construction. Per-epoch evaluation is disabled; accuracy is
    computed once at the end by ``evaluate_model``.
    """
    return TrainingArguments(
        output_dir="./results",
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
        evaluation_strategy="no",
        logging_dir="./logs",
    )


def train_model(model, tokenizer, X_train, y_train, training_args):
    """Fine-tune ``model`` on the given texts/labels and return it."""
    train_encodings = tokenizer(list(X_train), truncation=True, padding=True)
    train_dataset = Dataset.from_dict({
        "input_ids": train_encodings["input_ids"],
        "attention_mask": train_encodings["attention_mask"],
        "labels": y_train.tolist(),
    })
    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
    trainer.train()
    return model


def evaluate_model(model, tokenizer, X_test, y_test):
    """Return the accuracy of ``model`` on the given texts/labels."""
    test_encodings = tokenizer(list(X_test), truncation=True, padding=True)
    test_dataset = Dataset.from_dict({
        "input_ids": test_encodings["input_ids"],
        "attention_mask": test_encodings["attention_mask"],
        "labels": y_test.tolist(),
    })
    trainer = Trainer(model=model)
    predictions = trainer.predict(test_dataset)
    preds = predictions.predictions.argmax(axis=-1)
    return accuracy_score(y_test, preds)


def auto_train(file, model_name="bert-base-uncased", epochs=3, batch_size=8,
               learning_rate=5e-5):
    """Gradio callback: train on the uploaded CSV and report held-out accuracy.

    BUG FIX (parameter order): Gradio passes its inputs positionally as
    (file, model, epochs, batch size, learning rate), but the original
    signature was ``(..., learning_rate, epochs, batch_size)`` — so the
    Epochs slider silently set the learning rate, Batch Size set the epoch
    count, and Learning Rate set the batch size.
    """
    X, y = preprocess_data(file)
    # BUG FIX: evaluate on a held-out split instead of reporting training
    # accuracy; train_test_split was imported but never used.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model, tokenizer = load_model(model_name)
    training_args = setup_training_args(learning_rate, epochs, batch_size)
    model = train_model(model, tokenizer, X_train, y_train, training_args)
    accuracy = evaluate_model(model, tokenizer, X_test, y_test)
    return f"Training Complete! Accuracy: {accuracy * 100:.2f}%"


iface = gr.Interface(
    fn=auto_train,
    inputs=[
        # Only CSV is parsed by preprocess_data, so only CSV is accepted
        # (the original also advertised "json", which pd.read_csv would
        # have choked on).
        gr.File(label="Upload Dataset (CSV)", file_types=[".csv"]),
        gr.Dropdown(
            choices=["bert-base-uncased", "distilbert-base-uncased", "roberta-base"],
            value="bert-base-uncased",
            label="Model Selection",
        ),
        gr.Slider(minimum=1, maximum=5, step=1, label="Epochs", value=3),
        gr.Slider(minimum=1, maximum=16, step=1, label="Batch Size", value=8),
        gr.Slider(minimum=1e-6, maximum=1e-4, step=1e-5, label="Learning Rate", value=5e-5),
    ],
    outputs="text",
    # BUG FIX: live=True re-ran the full training loop on every widget
    # change; training should start only on explicit submit.
    title="AutoTrain Replica",
    description="Train models on your dataset with custom hyperparameters.",
)

if __name__ == "__main__":
    iface.launch()