import gradio as gr
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence
import numpy as np
from huggingface_hub import hf_hub_download
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pickle
import json
import nltk

# Tokenizer data required by nltk.word_tokenize.
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
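

# Small local helper (added here for clarity; not part of the hosted model
# repos): download a file from the Hugging Face Hub and unpickle it, making
# sure the file handle is closed.
def load_pickle(repo_id, filename):
    with open(hf_hub_download(repo_id, filename), "rb") as f:
        return pickle.load(f)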


# Classical ML baselines: pickled classifier + fitted vectorizer pairs
# (scikit-learn-style transform/predict API).
ml1_model = load_pickle("InfZnDipl/ML_LogReg", "model.pkl")
ml1_vectorizer = load_pickle("InfZnDipl/ML_LogReg", "vectorizer.pkl")

ml2_model = load_pickle("InfZnDipl/ML_N_Bayes", "model.pkl")
ml2_vectorizer = load_pickle("InfZnDipl/ML_N_Bayes", "vectorizer.pkl")

# Run the PyTorch models on GPU when available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
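

# CNN sentiment classifier: parallel convolutions of different widths over the
# embedded token sequence, max-pooled over time, concatenated, then passed
# through dropout and a linear layer that produces the 3 class logits.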
class SentimentCNN(nn.Module):
    def __init__(self, embedding_matrix, output_dim=3, filter_sizes=(3, 4, 5), num_filters=350, dropout=0.458):
        super().__init__()
        vocab_size, embedding_dim = embedding_matrix.shape
        # Start from the pretrained embeddings and fine-tune them (freeze=False).
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_matrix), freeze=False)

        self.convs = nn.ModuleList([
            nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
        ])

        self.fc = nn.Linear(num_filters * len(filter_sizes), output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)  # [batch, 1, seq_len, emb_dim]
        x = [torch.relu(conv(x)).squeeze(3) for conv in self.convs]
        x = [torch.max(i, dim=2)[0] for i in x]  # max over time
        x = torch.cat(x, dim=1)
        x = self.dropout(x)
        return self.fc(x)

# CNN vocabulary (token -> index) and the pretrained embedding matrix.
with open(hf_hub_download("InfZnDipl/CNN_TRAIN", "vocab_cnn_nltk_TRAIN.json"), "r") as f:
    vocab = json.load(f)

embedding_matrix = np.load(hf_hub_download("InfZnDipl/CNN_TRAIN", "embedding_matrix.npy"))
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float)

# Instantiate the CNN with the training-time hyperparameters, then load the trained weights.
dl2_model = SentimentCNN(
    embedding_matrix=embedding_matrix,
    num_filters=275,
    output_dim=3,
    filter_sizes=[3, 4, 5],
    dropout=0.45827226824902384
)
dl2_model.load_state_dict(torch.load(hf_hub_download("InfZnDipl/CNN_TRAIN", "CNN1-T-t11.pt"), map_location=device))
dl2_model.to(device)
dl2_model.eval()

# Padding/truncation settings. The CNN and the LSTM below use different
# vocabularies, so each gets its own <UNK> index.
PAD_IDX = 0
CNN_UNK_IDX = vocab.get("<UNK>", 1)
MAX_LEN = 100


def preprocess_dl_text(text):
    """Tokenize, map tokens to CNN vocabulary ids, and pad/truncate to MAX_LEN."""
    tokens = word_tokenize(text.lower())
    ids = [vocab.get(token, CNN_UNK_IDX) for token in tokens]
    if not ids:
        # Guard against empty or whitespace-only input.
        ids = [PAD_IDX]
    length = len(ids)

    if length < MAX_LEN:
        ids += [PAD_IDX] * (MAX_LEN - length)
    else:
        ids = ids[:MAX_LEN]
        length = MAX_LEN

    ids_tensor = torch.tensor([ids], dtype=torch.long)
    length_tensor = torch.tensor([length], dtype=torch.long)
    return ids_tensor, length_tensor
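

# (Bi)LSTM sentiment classifier: embedding -> packed LSTM -> final hidden
# state(s) -> dropout + linear layer producing the 3 class logits.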
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout_rate, pad_index, pretrained_embeddings=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_index)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(pretrained_embeddings)
        # Inter-layer dropout only applies when the LSTM is stacked.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            bidirectional=bidirectional, dropout=dropout_rate if n_layers > 1 else 0,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, ids, length):
        embedded = self.dropout(self.embedding(ids))
        # Pack so the LSTM ignores padding positions.
        packed_embedded = pack_padded_sequence(embedded, length.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed_embedded)
        if self.lstm.bidirectional:
            # Concatenate the final forward and backward hidden states.
            hidden = self.dropout(torch.cat([hidden[-2], hidden[-1]], dim=1))
        else:
            hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

# Artifacts for the trained (Bi)LSTM: vocabulary, pretrained embeddings,
# model weights, and the tuned hyperparameters.
rnn_vocab_path = hf_hub_download("InfZnDipl/RNN_LSTM_TRAIN", "vocab-NOVO-6.json")
rnn_embeddings_path = hf_hub_download("InfZnDipl/RNN_LSTM_TRAIN", "embeddings-NOVO-6.pt")
rnn_weights_path = hf_hub_download("InfZnDipl/RNN_LSTM_TRAIN", "RNN-NOVO-T-t2-6.pt")
rnn_params_path = hf_hub_download("InfZnDipl/RNN_LSTM_TRAIN", "RNN-params-NOVO-6.json")

with open(rnn_vocab_path, "r") as f:
    rnn_vocab = json.load(f)

rnn_embeddings = torch.load(rnn_embeddings_path, map_location=device)
embedding_dim = rnn_embeddings.shape[1]
output_dim = 3
bidirectional = True
RNN_UNK_IDX = rnn_vocab.get("<UNK>", 1)

with open(rnn_params_path, "r") as f:
    rnn_params = json.load(f)

dl1_model = LSTM(
    vocab_size=rnn_embeddings.shape[0],
    embedding_dim=embedding_dim,
    hidden_dim=rnn_params["hidden_dim"],
    output_dim=output_dim,
    n_layers=rnn_params["n_layers"],
    bidirectional=bidirectional,
    dropout_rate=rnn_params["dropout_rate"],
    pad_index=PAD_IDX,
    pretrained_embeddings=rnn_embeddings
).to(device)

dl1_model.load_state_dict(torch.load(rnn_weights_path, map_location=device))
dl1_model.eval()


def preprocess_rnn_text(text):
    """Tokenize, map tokens to LSTM vocabulary ids, and pad/truncate to MAX_LEN."""
    tokens = word_tokenize(text.lower())
    ids = [rnn_vocab.get(token, RNN_UNK_IDX) for token in tokens]
    if not ids:
        # Avoid zero-length sequences, which pack_padded_sequence rejects.
        ids = [PAD_IDX]
    length = len(ids)

    if length < MAX_LEN:
        ids += [PAD_IDX] * (MAX_LEN - length)
    else:
        ids = ids[:MAX_LEN]
        length = MAX_LEN

    ids_tensor = torch.tensor([ids], dtype=torch.long)
    length_tensor = torch.tensor([length], dtype=torch.long)
    return ids_tensor.to(device), length_tensor.to(device)
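
# Transformer baselines: two fine-tuned sequence-classification checkpoints
# from the Hub (BERTić-based and CroSlo-based).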
trans1_tokenizer = AutoTokenizer.from_pretrained("InfZnDipl/bertic-sentiment_TRAIN")
trans1_model = AutoModelForSequenceClassification.from_pretrained("InfZnDipl/bertic-sentiment_TRAIN").to(device).eval()

trans2_tokenizer = AutoTokenizer.from_pretrained("InfZnDipl/croslo-sentiment_TRAIN")
trans2_model = AutoModelForSequenceClassification.from_pretrained("InfZnDipl/croslo-sentiment_TRAIN").to(device).eval()
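

# Map a class index to a human-readable label. The order is assumed to match
# the label encoding used when the models were trained.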
def get_label(index):
    return ["Positive", "Neutral", "Negative"][index]


def predict_ml(text, model, vectorizer):
    # Vectorize the raw text and classify with the pickled model.
    X = vectorizer.transform([text])
    pred = model.predict(X)[0]
    return get_label(pred)


def predict_dl(text, model, model_type="lstm"):
    # The LSTM needs sequence lengths for packing; the CNN only needs the ids.
    if model_type == "lstm":
        ids_tensor, length_tensor = preprocess_rnn_text(text)
    else:
        ids_tensor, length_tensor = preprocess_dl_text(text)

    ids_tensor = ids_tensor.to(device)

    with torch.no_grad():
        if model_type == "lstm":
            logits = model(ids_tensor, length_tensor)
        else:
            logits = model(ids_tensor)

    if isinstance(logits, tuple):
        logits = logits[0]

    pred = torch.argmax(logits, dim=1).item()
    return get_label(pred)


def predict_transformer(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Move inputs to the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=1).item()
    return get_label(pred)


def analyze_sentiment(text):
    # Run the same input through all six models.
    results = {
        "ML Logistic Regression": predict_ml(text, ml1_model, ml1_vectorizer),
        "ML Naive Bayes": predict_ml(text, ml2_model, ml2_vectorizer),
        "DL Model: RNN-LSTM": predict_dl(text, dl1_model, model_type="lstm"),
        "DL Model: CNN": predict_dl(text, dl2_model, model_type="cnn"),
        "Transformer: BERTIC": predict_transformer(text, trans1_tokenizer, trans1_model),
        "Transformer: CROSLO": predict_transformer(text, trans2_tokenizer, trans2_model),
    }
    return results
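
# Quick smoke test without the UI (example input; uncomment to try):
# print(analyze_sentiment("Ovo je najbolji film ikad!"))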


def gradio_interface(text):
    # Unpack the results dict in the order the output boxes are wired below.
    results = analyze_sentiment(text)
    return [
        results["ML Logistic Regression"],
        results["ML Naive Bayes"],
        results["DL Model: RNN-LSTM"],
        results["DL Model: CNN"],
        results["Transformer: BERTIC"],
        results["Transformer: CROSLO"]
    ]
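

# Gradio UI: one input box and six output boxes grouped by model family.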
with gr.Blocks(css="""
.gr-button-primary {
    background-color: orange !important;
    border: none !important;
    color: white !important;
}
#title, .section-title {
    text-align: center;
}
.gr-textbox {
    border: 1px solid white !important;
}
""") as demo:
    gr.Markdown(
        """
        # Sentiment Analysis Demo / Analiza Sentimenta
        Enter a sentence in **Croatian** to see how 6 different models classify its sentiment: **positive**, **neutral**, or **negative**.
        Unesite rečenicu na **hrvatskom jeziku** da biste vidjeli kako je klasificira 6 različitih modela: **pozitivno**, **neutralno** ili **negativno**.
        """,
        elem_id="title"
    )

    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Enter a sentence / Unesite rečenicu",
                placeholder="Npr. Ovo je najbolji film ikad!",
                lines=2
            )

    submit_btn = gr.Button("Analyze / Analiziraj", variant="primary")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Machine Learning", elem_classes=["section-title"])
            ml1_out = gr.Textbox(label="Logistic Regression")
            ml2_out = gr.Textbox(label="Naive Bayes")

        with gr.Column():
            gr.Markdown("### Deep Learning", elem_classes=["section-title"])
            dl1_out = gr.Textbox(label="RNN-LSTM")
            dl2_out = gr.Textbox(label="CNN")

        with gr.Column():
            gr.Markdown("### Transformers", elem_classes=["section-title"])
            trans1_out = gr.Textbox(label="BERTić")
            trans2_out = gr.Textbox(label="CroSloBERTa")

    # Wire the button to the prediction function; the output order matches
    # gradio_interface's return order.
    submit_btn.click(
        gradio_interface,
        inputs=text_input,
        outputs=[ml1_out, ml2_out, dl1_out, dl2_out, trans1_out, trans2_out]
    )

demo.launch()