Ensemble Model untuk Klasifikasi Emosi Multi-Label
Ini adalah repositori untuk sistem model ensemble yang meraih peringkat pertama dalam tugas klasifikasi emosi multi-label. Sistem ini menggabungkan dua model kuat, DeBERTa-v3-Large dan RoBERTa-Large, yang dilatih dengan teknik LLRD (Layer-wise Learning Rate Decay) dan Focal Loss.
Komponen Ensemble
- `deberta_model`: Model `microsoft/deberta-v3-large` yang telah di-fine-tune.
- `roberta_model`: Model `roberta-large` yang telah di-fine-tune.
- `best_thresholds.json`: Array berisi 14 nilai threshold optimal untuk setiap label, yang digunakan pada hasil rata-rata probabilitas kedua model.
Cara Menggunakan
Berikut adalah contoh kode untuk memuat semua komponen dan melakukan prediksi dengan ensemble ini:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import expit as sigmoid
import json
import requests
import numpy as np
# -- Repository information --
# Hub repository that hosts both fine-tuned models and the threshold file.
REPO_ID = "Trentz/emotion-classification-ensemble"

# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# -- Label mapping --
# Position i in this list names the emotion for output index i.
LABELS = [
    'amusement',
    'anger',
    'annoyance',
    'caring',
    'confusion',
    'disappointment',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'joy',
    'love',
    'sadness',
]
class EmotionEnsemble:
    """Two-model ensemble for multi-label emotion classification.

    A DeBERTa and a RoBERTa sequence classifier are loaded from the
    ``deberta_model`` and ``roberta_model`` subfolders of one Hugging Face
    Hub repository, together with ``best_thresholds.json`` from the same
    repository. A prediction averages the sigmoid probabilities of both
    models and compares the average against the per-label thresholds.
    """

    # Pieces of the Hub download URL for the threshold file.
    _HUB_URL = "https://huggingface.co"
    _THRESHOLDS_FILE = "best_thresholds.json"
    # Seconds to wait for the Hub before giving up instead of hanging forever.
    _REQUEST_TIMEOUT = 30

    def __init__(self, repo_id, device="cpu"):
        """Load both models, their tokenizers and the decision thresholds.

        Args:
            repo_id: Hub repository id (``"owner/name"``) holding the
                ``deberta_model`` and ``roberta_model`` subfolders and the
                ``best_thresholds.json`` file.
            device: Torch device string the models and thresholds are
                placed on.

        Raises:
            requests.HTTPError: If the threshold file request returns an
                HTTP error status.
            requests.RequestException: On connection failure or timeout.
        """
        self.device = device
        print("Memuat semua komponen model...")

        # Load DeBERTa.
        self.deberta_tokenizer, self.deberta_model = self._load_member(
            repo_id, "deberta_model"
        )
        # Load RoBERTa.
        self.roberta_tokenizer, self.roberta_model = self._load_member(
            repo_id, "roberta_model"
        )

        # Load the thresholds from the same repository that was passed in,
        # and fail loudly on an HTTP error rather than parsing an error page.
        response = requests.get(
            self._thresholds_url(repo_id), timeout=self._REQUEST_TIMEOUT
        )
        response.raise_for_status()
        self.thresholds = torch.tensor(
            response.json(), dtype=torch.float32, device=self.device
        )
        print("Semua komponen berhasil dimuat.")

    @staticmethod
    def _thresholds_url(repo_id):
        """Return the Hub download URL of the threshold file for *repo_id*."""
        return (
            f"{EmotionEnsemble._HUB_URL}/{repo_id}"
            f"/resolve/main/{EmotionEnsemble._THRESHOLDS_FILE}"
        )

    def _load_member(self, repo_id, subfolder):
        """Load one tokenizer/model pair from *subfolder*, in eval mode."""
        tokenizer = AutoTokenizer.from_pretrained(repo_id, subfolder=subfolder)
        model = AutoModelForSequenceClassification.from_pretrained(
            repo_id, subfolder=subfolder
        )
        return tokenizer, model.to(self.device).eval()

    def _probabilities(self, tokenizer, model, text):
        """Return the per-label sigmoid probabilities of *model* for *text*."""
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, padding=True
        ).to(self.device)
        # Drop only the leading batch axis so the label axis always survives.
        return torch.sigmoid(model(**inputs).logits).squeeze(0)

    def predict(self, text: str):
        """Predict the emotions expressed in a single text.

        Args:
            text: The input string to classify.

        Returns:
            A dict with the keys ``"text"`` (the input), ``"predicted_emotions"``
            (list of label names) and ``"scores"`` (averaged probabilities as
            a list of floats, in ``LABELS`` order).
        """
        with torch.no_grad():
            # DeBERTa prediction.
            deberta_probs = self._probabilities(
                self.deberta_tokenizer, self.deberta_model, text
            )
            # RoBERTa prediction.
            roberta_probs = self._probabilities(
                self.roberta_tokenizer, self.roberta_model, text
            )
            # Average the probabilities of both models.
            avg_probs = (deberta_probs + roberta_probs) / 2.0

        # Apply the thresholds, with a "best guess" fallback: when no label
        # clears its threshold, report the single most probable label.
        above_threshold = (avg_probs > self.thresholds).tolist()
        if not any(above_threshold):
            best_guess_idx = torch.argmax(avg_probs).item()
            final_labels = [LABELS[best_guess_idx]]
        else:
            final_labels = [
                LABELS[i] for i, hit in enumerate(above_threshold) if hit
            ]

        return {
            "text": text,
            "predicted_emotions": final_labels,
            "scores": avg_probs.cpu().tolist(),
        }
# -- Example usage --
# Build the ensemble once; every component is loaded during construction.
ensemble_model = EmotionEnsemble(REPO_ID, device=DEVICE)

# Texts to classify.
example_text = "This is amazing! Thank you so much for everything, I really love it."
example_text_2 = "I can't believe you would do that. It's so annoying and disappointing."

# First prediction.
# Expected output to contain: 'amusement', 'excitement', 'joy', 'love', 'gratitude'
result = ensemble_model.predict(example_text)
print(result)

# Second prediction.
# Expected output to contain: 'annoyance', 'disappointment', 'anger'
result_2 = ensemble_model.predict(example_text_2)
print(result_2)