In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import torch
from torch.utils.data import Dataset
import shutil
from tqdm import tqdm
import numpy as np

# === 0. Define model and output paths ===
# Identifier of the pretrained checkpoint; passed to both the tokenizer's and
# the classification model's from_pretrained() calls further down.
MODEL_NAME = "boltuix/NeuroBERT"
# Local directory the fine-tuned model and tokenizer are saved to after
# training; the same directory is zipped at the end of the cell.
OUTPUT_DIR = "./neuro-nearby"

# === 1. Custom callback for tqdm progress bar ===
class TQDMProgressBarCallback(TrainerCallback):
    """Trainer callback that mirrors training progress on a single tqdm bar.

    The bar is created when training begins, kept in sync with
    ``state.global_step`` after every optimisation step, and closed when
    training ends. Syncing to ``global_step`` (instead of blindly adding 1)
    keeps the bar correct when training resumes from a checkpoint.
    """

    def __init__(self):
        super().__init__()
        self.progress_bar = None
        # Populated in on_train_begin; declared here so the attribute always exists.
        self.total_steps = None

    def on_train_begin(self, args, state, control, **kwargs):
        """Open a fresh bar sized to the total number of training steps."""
        # A previous run that was interrupted may have left a bar open.
        if self.progress_bar is not None:
            self.progress_bar.close()
        self.total_steps = state.max_steps
        self.progress_bar = tqdm(
            total=self.total_steps,
            initial=state.global_step,  # non-zero when resuming from a checkpoint
            desc="Training",
            unit="step",
        )

    def on_step_end(self, args, state, control, **kwargs):
        """Advance the bar to the trainer's current global step."""
        if self.progress_bar is None:
            # on_train_begin never ran (or the bar was already closed): nothing to update.
            return
        advanced = state.global_step - self.progress_bar.n
        if advanced > 0:
            self.progress_bar.update(advanced)
        epoch = state.epoch if state.epoch is not None else 0.0
        self.progress_bar.set_postfix({
            "epoch": f"{epoch:.2f}",
            "step": state.global_step
        })

    def on_train_end(self, args, state, control, **kwargs):
        """Close the bar and drop the reference so the callback can be reused."""
        if self.progress_bar is not None:
            self.progress_bar.close()
            self.progress_bar = None

# === 2. Load and preprocess data ===
dataset_path = 'dataset.csv'
df = pd.read_csv(dataset_path)
df = df.dropna(subset=['category'])
# Positional rename: this assumes the CSV has exactly two columns, the
# category first and the sentence second.
df.columns = ['label', 'text']  # Rename columns
# A row with a missing sentence would reach the tokenizer as a NaN float and
# fail there, so drop those rows too and make every remaining text a str.
df = df.dropna(subset=['text'])
df = df.assign(text=df['text'].astype(str))

# === 3. Encode labels ===
# Sorting makes the label -> id assignment deterministic across runs.
labels = sorted(df["label"].unique())
label_to_id = {label: idx for idx, label in enumerate(labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}
df['label'] = df['label'].map(label_to_id)

# === 4. Train-val split ===
# Stratified 80/20 split so every category keeps its share in both subsets.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42, stratify=df['label']
)

# === 5. Tokenizer ===
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# === 6. Dataset class ===
class CategoryDataset(Dataset):
    """Map-style dataset pairing raw sentences with integer class ids.

    Each item is tokenized on access, padded/truncated to ``max_length``, and
    returned as a dict with ``input_ids``, ``attention_mask`` and ``labels``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_options = dict(
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_options)
        # The tokenizer returns a leading batch axis of size 1; strip it so
        # the default collate function can stack the items itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# === 7. Load datasets ===
train_dataset = CategoryDataset(train_texts, train_labels, tokenizer)
val_dataset = CategoryDataset(val_texts, val_labels, tokenizer)

# === 8. Load model with num_labels ===
# Passing the label maps here (not only right before the final save) means
# every checkpoint the Trainer writes during training already carries the
# real category names instead of the generic LABEL_0, LABEL_1, ... defaults.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_to_id),
    id2label=id_to_label,
    label2id=label_to_id,
)

# === 9. Define metrics for evaluation ===
def compute_metrics(eval_pred):
    """Turn an evaluation prediction into accuracy and weighted-F1 scores.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    return {
        'accuracy': accuracy_score(labels, predicted_ids),
        'f1_weighted': f1_score(labels, predicted_ids, average='weighted'),
    }

# === 10. Training arguments ===
training_args = TrainingArguments(
    output_dir='./results',
    run_name="NeuroNearby",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",  # Corrected from evaluation_strategy
    # Checkpoint once per epoch and keep only the two newest ones. The default
    # (every 500 steps, unlimited) would write hundreds of checkpoints over a
    # run of this length and can exhaust the disk.
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none"
)


# === 11. Trainer setup ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[TQDMProgressBarCallback()]
)

# === 12. Train and evaluate ===
trainer.train()
# Keep and show the final metrics; a bare call in the middle of a cell would
# silently discard them.
final_eval_metrics = trainer.evaluate()
print(f"Final evaluation metrics: {final_eval_metrics}")

# === 13. Save model and tokenizer ===
# Store the human-readable label maps in the config so the saved model can
# report category names at inference time.
model.config.label2id = label_to_id
model.config.id2label = id_to_label
model.config.num_labels = len(label_to_id)

model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# === 14. Zip model directory ===
# Derive the archive name from OUTPUT_DIR so the archive and the messages
# below stay correct if the output path is ever changed.
archive_path = shutil.make_archive(OUTPUT_DIR.rstrip("/"), 'zip', OUTPUT_DIR)
print(f"✅ Training complete. Model and tokenizer saved to {OUTPUT_DIR}")
print(f"✅ Model directory zipped to {archive_path}")

# === 15. Test function with confidence threshold ===
def run_test_cases(model, tokenizer, test_sentences, label_to_id, id_to_label,
                   confidence_threshold=0.5, max_length=128, min_accuracy=70.0):
    """Classify labelled sentences, print a report, and enforce a minimum accuracy.

    Args:
        model: Sequence-classification model; called with ``input_ids`` and
            ``attention_mask`` and expected to return an object with ``.logits``.
        tokenizer: Callable producing ``input_ids`` / ``attention_mask`` tensors.
        test_sentences: Iterable of ``(sentence, expected_label)`` pairs.
        label_to_id: Mapping of known label names; used only to warn about
            expected labels the model can never predict.
        id_to_label: Mapping from predicted class index to label name.
        confidence_threshold: Predictions whose top probability is below this
            value are reported as ``"unknown"``.
        max_length: Padding/truncation length handed to the tokenizer.
        min_accuracy: Minimum accuracy (in percent) required to pass.

    Returns:
        A list with one dict per sentence (``sentence``, ``expected``,
        ``predicted``, ``confidence``, ``correct``); empty if no test cases
        were supplied.

    Raises:
        AssertionError: If the accuracy is below ``min_accuracy``.
    """
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Materialise once so generators work and len() is well defined.
    test_sentences = list(test_sentences)
    total = len(test_sentences)
    if total == 0:
        # Accuracy is undefined without samples; avoid dividing by zero.
        print("\nNo test cases provided; nothing to evaluate.")
        return []

    if label_to_id:
        # An expected label outside the training label set can never match,
        # which usually means a typo in the test data.
        unmatched = list(dict.fromkeys(
            expected for _, expected in test_sentences
            if expected not in label_to_id and expected != "unknown"
        ))
        if unmatched:
            print(f"Warning: expected labels not in the model's label set: {unmatched}")

    correct = 0
    results = []

    for text, expected_label in test_sentences:
        encoding = tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            max_prob, predicted_id = torch.max(probs, dim=1)

        confidence = max_prob.item()
        predicted_label = id_to_label[predicted_id.item()]
        # Apply confidence threshold
        if confidence < confidence_threshold:
            predicted_label = "unknown"

        is_correct = (predicted_label == expected_label)
        if is_correct:
            correct += 1
        results.append({
            "sentence": text,
            "expected": expected_label,
            "predicted": predicted_label,
            "confidence": confidence,
            "correct": is_correct
        })

    accuracy = correct / total * 100
    print(f"\nTest Cases Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")

    for r in results:
        status = "✓" if r["correct"] else "✗"
        print(f"{status} '{r['sentence']}'")
        print(f"   Expected: {r['expected']}, Predicted: {r['predicted']}, Confidence: {r['confidence']:.3f}")

    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if accuracy < min_accuracy:
        raise AssertionError(f"Test failed: Accuracy {accuracy:.2f}% < {min_accuracy:g}%")
    return results

# === 16. Sample test sentences for testing ===
# Sentences are grouped by the category they are expected to resolve to and
# then flattened into the (sentence, expected_label) pairs run_test_cases takes.
samples_by_label = {
    "airport": [
        "Where is the nearest airport to this location?",
        "Can I bring a laptop through airport security?",
        "How do I get to the closest airport terminal?",
    ],
    "accounting firm": [
        "Need help finding an accounting firm for tax planning.",
        "Can an accounting firm help with financial audits?",
        "Looking for an accounting firm to manage payroll.",
    ],
    # Add more diverse sentences covering your 155 categories
}
test_sentences = [
    (sentence, expected_label)
    for expected_label, sentences in samples_by_label.items()
    for sentence in sentences
]

print("\nRunning test cases...")
test_results = run_test_cases(
    model,
    tokenizer,
    test_sentences,
    label_to_id,
    id_to_label,
)
print("✅ Test cases completed.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/57.5M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at boltuix/NeuroBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training:   0%|          | 1/152460 [00:01<49:57:16,  1.18s/step, epoch=0.00, step=1]

Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.1739,0.182693,0.953562,0.953709
2,0.2205,0.153253,0.961957,0.961822
3,0.0578,0.125174,0.969599,0.969571
4,0.0856,0.118067,0.972427,0.972436
5,0.0354,0.115079,0.973632,0.973634


Training: 100%|██████████| 152460/152460 [2:12:36<00:00, 19.16step/s, epoch=5.00, step=152460]


✅ Training complete. Model and tokenizer saved to ./neuro-nearby
✅ Model directory zipped to neuro-nearby.zip

Running test cases...

Test Cases Accuracy: 100.00% (6/6 correct)
✓ 'Where is the nearest airport to this location?'
   Expected: airport, Predicted: airport, Confidence: 1.000
✓ 'Can I bring a laptop through airport security?'
   Expected: airport, Predicted: airport, Confidence: 1.000
✓ 'How do I get to the closest airport terminal?'
   Expected: airport, Predicted: airport, Confidence: 1.000
✓ 'Need help finding an accounting firm for tax planning.'
   Expected: accounting firm, Predicted: accounting firm, Confidence: 1.000
✓ 'Can an accounting firm help with financial audits?'
   Expected: accounting firm, Predicted: accounting firm, Confidence: 1.000
✓ 'Looking for an accounting firm to manage payroll.'
   Expected: accounting firm, Predicted: accounting firm, Confidence: 1.000
✅ Test cases completed.


In [2]:
# === 17. Additional test sentences (indirect phrasing) ===
# Grouped by expected category, then flattened into (sentence, expected_label)
# pairs in the same order as listed here.
extra_samples_by_label = {
    "shoe store": [
        "show me a store that sells snow shoes",
    ],
    "city hall": [
        "Is there a hotline for reporting a malfunctioning streetlight near City Hall?",
    ],
    "pharmacy": [
        "i need to get my allergy medication refilled",
        "i have a headache",
    ],
    "bicycle store": [
        "Looking for a shop with bike chains.",
    ],
}
test_sentences = [
    (sentence, expected_label)
    for expected_label, sentences in extra_samples_by_label.items()
    for sentence in sentences
]
print("\nRunning test cases...")
test_results = run_test_cases(
    model,
    tokenizer,
    test_sentences,
    label_to_id,
    id_to_label,
)
print("✅ Test cases completed.")


Running test cases...

Test Cases Accuracy: 100.00% (3/3 correct)
✓ 'show me a store that sells snow shoes'
   Expected: shoe store, Predicted: shoe store, Confidence: 0.999
✓ 'Is there a hotline for reporting a malfunctioning streetlight near City Hall?'
   Expected: city hall, Predicted: city hall, Confidence: 1.000
✓ 'i need to get my allergy medication refilled'
   Expected: pharmacy, Predicted: pharmacy, Confidence: 0.999
✅ Test cases completed.
