import gradio as gr
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pickle
import json
import nltk

# Tokenizer data required by nltk.word_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
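

# Small local helper (added here for clarity; not part of the hosted model
# repos): download a file from the Hugging Face Hub and unpickle it, making
# sure the file handle is closed.
def load_pickle(repo_id, filename):
    with open(hf_hub_download(repo_id, filename), "rb") as f:
        return pickle.load(f)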


# Classical ML baselines: pickled classifier + fitted vectorizer pairs
# (scikit-learn-style transform/predict API).
ml1_model = load_pickle("InfZnDipl/ML_LogReg", "model.pkl")
ml1_vectorizer = load_pickle("InfZnDipl/ML_LogReg", "vectorizer.pkl")

ml2_model = load_pickle("InfZnDipl/ML_N_Bayes", "model.pkl")
ml2_vectorizer = load_pickle("InfZnDipl/ML_N_Bayes", "vectorizer.pkl")

# Run the PyTorch models on GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
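

# CNN sentiment classifier: parallel convolutions of different widths over the
# embedded token sequence, max-pooled over time, concatenated, then passed
# through dropout and a linear layer that produces the 3 class logits.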
class SentimentCNN(nn.Module):
    def __init__(self, embedding_matrix, output_dim=3, filter_sizes=(3, 4, 5), num_filters=350, dropout=0.458):
        super().__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        # Start from the pretrained embeddings and fine-tune them (freeze=False).
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)

        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
        ])

        self.fc = nn.Linear(num_filters * len(filter_sizes), output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)  # [batch, 1, seq_len, emb_dim]
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [torch.max(i, dim=2)[0] for i in x]  # max over time
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return self.fc(x)

# CNN vocabulary (token -> index) and the pretrained embedding matrix.
with open(hf_hub_download("InfZnDipl/CNN_TRAIN", "vocab_cnn_nltk_TRAIN.json"), "r") as f:
    vocab = json.load(f)

embedding_matrix = np.load(hf_hub_download("InfZnDipl/CNN_TRAIN", "embedding_matrix.npy"))
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float)

# Instantiate the CNN with the training-time hyperparameters, then load the trained weights.
dl2_model = SentimentCNN(
    embedding_matrix=embedding_matrix,
    num_filters=275,
    output_dim=3,
    filter_sizes=[3, 4, 5],
    dropout=0.45827226824902384
)
dl2_model.load_state_dict(torch.load(hf_hub_download("InfZnDipl/CNN_TRAIN", "CNN1-T-t11.pt"), map_location=device))
dl2_model.to(device)
dl2_model.eval()

# Padding/truncation settings. The CNN and the LSTM below use different
# vocabularies, so each gets its own <UNK> index.
PAD_IDX = 0
CNN_UNK_IDX = vocab.get("<UNK>", 1)
MAX_LEN = 100


def preprocess_dl_text(text):
    """Tokenize, map tokens to CNN vocabulary ids, and pad/truncate to MAX_LEN."""
    tokens = word_tokenize(text.lower())
    ids = [vocab.get(token, CNN_UNK_IDX) for token in tokens]
    if not ids:
        # Guard against empty or whitespace-only input.
        ids = [PAD_IDX]
    length = len(ids)

    if length < MAX_LEN:
        ids += [PAD_IDX] * (MAX_LEN - length)
    else:
        ids = ids[:MAX_LEN]
        length = MAX_LEN

    ids_tensor = torch.tensor([ids], dtype=torch.long)
    length_tensor = torch.tensor([length], dtype=torch.long)
    return ids_tensor, length_tensor
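

# (Bi)LSTM sentiment classifier: embedding -> packed LSTM -> final hidden
# state(s) -> dropout + linear layer producing the 3 class logits.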
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout_rate, pad_index, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
        # Inter-layer dropout only applies when the LSTM is stacked.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            bidirectional=bidirectional, dropout=dropout_rate if n_layers > 1 else 0,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        embedded = self.dropout(self.embedding(ids))
        # Pack so the LSTM ignores padding positions.
        packed_embedded = pack_padded_sequence(embedded, length.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_embedded)
        if self.lstm.bidirectional:
            # Concatenate the final forward and backward hidden states.
            hidden = self.dropout(torch.cat([hidden[-2], hidden[-1]], dim=1))
        else:
            hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

# Artifacts for the trained (Bi)LSTM: vocabulary, pretrained embeddings,
# model weights, and the tuned hyperparameters.
rnn_vocab_path = hf_hub_download("InfZnDipl/RNN_LSTM_TRAIN", "vocab-NOVO-6.json")
rnn_embeddings_path = hf_hub_download("InfZnDipl/RNN_LSTM_TRAIN", "embeddings-NOVO-6.pt")
rnn_weights_path = hf_hub_download("InfZnDipl/RNN_LSTM_TRAIN", "RNN-NOVO-T-t2-6.pt")
rnn_params_path = hf_hub_download("InfZnDipl/RNN_LSTM_TRAIN", "RNN-params-NOVO-6.json")

with open(rnn_vocab_path, "r") as f:
    rnn_vocab = json.load(f)

rnn_embeddings = torch.load(rnn_embeddings_path, map_location=device)
embedding_dim = rnn_embeddings.shape[1]
output_dim = 3
bidirectional = True
RNN_UNK_IDX = rnn_vocab.get("<UNK>", 1)

with open(rnn_params_path, "r") as f:
    rnn_params = json.load(f)

dl1_model = LSTM(
    vocab_size=rnn_embeddings.shape[0],
    embedding_dim=embedding_dim,
    hidden_dim=rnn_params["hidden_dim"],
    output_dim=output_dim,
    n_layers=rnn_params["n_layers"],
    bidirectional=bidirectional,
    dropout_rate=rnn_params["dropout_rate"],
    pad_index=PAD_IDX,
    pretrained_embeddings=rnn_embeddings
).to(device)

dl1_model.load_state_dict(torch.load(rnn_weights_path, map_location=device))
dl1_model.eval()


def preprocess_rnn_text(text):
    """Tokenize, map tokens to LSTM vocabulary ids, and pad/truncate to MAX_LEN."""
    tokens = word_tokenize(text.lower())
    ids = [rnn_vocab.get(token, RNN_UNK_IDX) for token in tokens]
    if not ids:
        # Avoid zero-length sequences, which pack_padded_sequence rejects.
        ids = [PAD_IDX]
    length = len(ids)

    if length < MAX_LEN:
        ids += [PAD_IDX] * (MAX_LEN - length)
    else:
        ids = ids[:MAX_LEN]
        length = MAX_LEN

    ids_tensor = torch.tensor([ids], dtype=torch.long)
    length_tensor = torch.tensor([length], dtype=torch.long)
    return ids_tensor.to(device), length_tensor.to(device)
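
# Transformer baselines: two fine-tuned sequence-classification checkpoints
# from the Hub (BERTić-based and CroSlo-based).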
trans1_tokenizer = AutoTokenizer.from_pretrained("InfZnDipl/bertic-sentiment_TRAIN")
trans1_model = AutoModelForSequenceClassification.from_pretrained("InfZnDipl/bertic-sentiment_TRAIN").to(device).eval()

trans2_tokenizer = AutoTokenizer.from_pretrained("InfZnDipl/croslo-sentiment_TRAIN")
trans2_model = AutoModelForSequenceClassification.from_pretrained("InfZnDipl/croslo-sentiment_TRAIN").to(device).eval()
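

# Map a class index to a human-readable label. The order is assumed to match
# the label encoding used when the models were trained.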
def get_label(index):
    return ["Positive", "Neutral", "Negative"][index]


def predict_ml(text, model, vectorizer):
    # Vectorize the raw text and classify with the pickled model.
    X = vectorizer.transform([text])
    pred = model.predict(X)[0]
    return get_label(pred)


def predict_dl(text, model, model_type="lstm"):
    # The LSTM needs sequence lengths for packing; the CNN only needs the ids.
    if model_type == "lstm":
        ids_tensor, length_tensor = preprocess_rnn_text(text)
    else:
        ids_tensor, length_tensor = preprocess_dl_text(text)

    ids_tensor = ids_tensor.to(device)

    with torch.no_grad():
        if model_type == "lstm":
            logits = model(ids_tensor, length_tensor)
        else:
            logits = model(ids_tensor)

    if isinstance(logits, tuple):
        logits = logits[0]

    pred = torch.argmax(logits, dim=1).item()
    return get_label(pred)


def predict_transformer(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Move inputs to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=1).item()
    return get_label(pred)


def analyze_sentiment(text):
    # Run the same input through all six models.
    results = {
        "ML Logistic Regression": predict_ml(text, ml1_model, ml1_vectorizer),
        "ML Naive Bayes": predict_ml(text, ml2_model, ml2_vectorizer),
        "DL Model: RNN-LSTM": predict_dl(text, dl1_model, model_type="lstm"),
        "DL Model: CNN": predict_dl(text, dl2_model, model_type="cnn"),
        "Transformer: BERTIC": predict_transformer(text, trans1_tokenizer, trans1_model),
        "Transformer: CROSLO": predict_transformer(text, trans2_tokenizer, trans2_model),
    }
    return results
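
# Quick smoke test without the UI (example input; uncomment to try):
# print(analyze_sentiment("Ovo je najbolji film ikad!"))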


def gradio_interface(text):
    # Unpack the results dict in the order the output boxes are wired below.
    results = analyze_sentiment(text)
    return [
        results["ML Logistic Regression"],
        results["ML Naive Bayes"],
        results["DL Model: RNN-LSTM"],
        results["DL Model: CNN"],
        results["Transformer: BERTIC"],
        results["Transformer: CROSLO"]
    ]
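

# Gradio UI: one input box and six output boxes grouped by model family.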
with gr.Blocks(css="""
.gr-button-primary {
    background-color: orange !important;
    border: none !important;
    color: white !important;
}
#title, .section-title {
    text-align: center;
}
.gr-textbox {
    border: 1px solid white !important;
}
""") as demo:
    gr.Markdown(
        """
        # Sentiment Analysis Demo / Analiza Sentimenta
        Enter a sentence in **Croatian** to see how 6 different models classify its sentiment: **positive**, **neutral**, or **negative**.
        Unesite rečenicu na **hrvatskom jeziku** da biste vidjeli kako je klasificira 6 različitih modela: **pozitivno**, **neutralno** ili **negativno**.
        """,
        elem_id="title"
    )

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Enter a sentence / Unesite rečenicu",
                placeholder="Npr. Ovo je najbolji film ikad!",
                lines=2
            )

    submit_btn = gr.Button("Analyze / Analiziraj", variant="primary")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Machine Learning", elem_classes=["section-title"])
            ml1_out = gr.Textbox(label="Logistic Regression")
            ml2_out = gr.Textbox(label="Naive Bayes")

        with gr.Column():
            gr.Markdown("### Deep Learning", elem_classes=["section-title"])
            dl1_out = gr.Textbox(label="RNN-LSTM")
            dl2_out = gr.Textbox(label="CNN")

        with gr.Column():
            gr.Markdown("### Transformers", elem_classes=["section-title"])
            trans1_out = gr.Textbox(label="BERTić")
            trans2_out = gr.Textbox(label="CroSloBERTa")

    # Wire the button to the prediction function; the output order matches
    # gradio_interface's return order.
    submit_btn.click(
        gradio_interface,
        inputs=text_input,
        outputs=[ml1_out, ml2_out, dl1_out, dl2_out, trans1_out, trans2_out]
    )

demo.launch()