Upload 4 files
Browse files- app.py +106 -0
- logreg_model.joblib +3 -0
- requirements.txt +5 -0
- vectorizer.joblib +3 -0
app.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""Untitled0.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colab.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/15OuC8fZC7qJeYm_3rj7H660aWfEVOx1J
|
8 |
+
"""
|
9 |
+
|
10 |
+
import pandas as pd
|
11 |
+
import re
|
12 |
+
import string
|
13 |
+
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
|
14 |
+
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
15 |
+
from transformers import pipeline
|
16 |
+
import gradio as gr
|
17 |
+
import joblib
|
18 |
+
|
19 |
+
# ------------- Preprocessing Functions -------------
# Informal/slang Indonesian -> standard form; applied word-by-word by normalize().
normalization_dict = {
    "gk": "tidak",
    "ga": "tidak",
    "bgt": "banget",
    "sm": "sama"
}

# Sastrawi resources are built once at import time and shared by the helpers
# below; the stopword list is materialized into a set for O(1) membership tests.
stop_factory = StopWordRemoverFactory()
stop_words = set(stop_factory.get_stop_words())
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()
|
31 |
+
|
32 |
+
def cleaning(text):
    """Strip URLs, digits, and punctuation from *text*, collapsing whitespace.

    Punctuation is replaced with spaces (so "abc,def" keeps two tokens),
    then runs of whitespace are squeezed and the ends trimmed.
    """
    without_urls = re.sub(r'http\S+', '', text)
    without_digits = re.sub(r'\d+', '', without_urls)
    punct_class = rf"[{re.escape(string.punctuation)}]"
    spaced = re.sub(punct_class, ' ', without_digits)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.strip()
|
38 |
+
|
39 |
+
def normalize(text):
    """Replace known slang tokens with their standard Indonesian form."""
    replaced = (normalization_dict.get(token, token) for token in text.split())
    return ' '.join(replaced)
|
41 |
+
|
42 |
+
def remove_stopwords(tokens):
    """Drop Indonesian stopwords (Sastrawi list) from *tokens*."""
    return list(filter(lambda tok: tok not in stop_words, tokens))
|
44 |
+
|
45 |
+
def stemming(tokens):
    """Reduce each token to its root form with the Sastrawi stemmer."""
    return list(map(stemmer.stem, tokens))
|
47 |
+
|
48 |
+
def preprocess(text):
    """Full text pipeline: lowercase -> clean -> normalize -> tokenize ->
    stopword removal -> stemming.  Returns the list of processed tokens."""
    normalized = normalize(cleaning(text.lower()))
    raw_tokens = normalized.split()
    return stemming(remove_stopwords(raw_tokens))
|
56 |
+
|
57 |
+
# ------------- Load Trained Model + Vectorizer -------------
# Both artifacts ship alongside this app (stored via Git LFS) and are loaded
# once at import time so requests do not pay the deserialization cost.
model = joblib.load("logreg_model.joblib")
vectorizer = joblib.load("vectorizer.joblib")
|
60 |
+
|
61 |
+
# ------------- Load Zero-Shot Classifier (still with transformers) -------------
# NOTE(review): this downloads a large multilingual XNLI checkpoint on first
# run. Candidate labels are Indonesian and ranked by the pipeline per input.
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
labels = ["positif", "negatif"]
|
64 |
+
|
65 |
+
def auto_label(text):
    """Return the top zero-shot label for *text*, or an error string.

    Best-effort: any pipeline failure is reported as "Error: ..." rather
    than propagated, so the UI keeps working.
    """
    try:
        outcome = classifier(text, labels)
    except Exception as exc:
        return f"Error: {str(exc)}"
    return outcome['labels'][0]
|
71 |
+
|
72 |
+
# ------------- Gradio Interface -------------
def predict_sentiment(text):
    """Classify *text* with both the TF-IDF + LogReg model and the zero-shot model.

    Returns a 3-tuple matching the interface's three output components:
    (space-joined preprocessed tokens, LogReg label, zero-shot label).

    Bug fix: the previous version returned one dict keyed by plain strings,
    but the Interface declares three outputs — Gradio only accepts a dict
    return when it is keyed by component objects, so every prediction
    errored.  A positional tuple maps one value per output component.
    """
    try:
        tokens = preprocess(text)
        final_text = ' '.join(tokens)
        X_input = vectorizer.transform([final_text])
        # predict() returns a numpy scalar; cast so gr.Label gets a plain str
        pred_label = str(model.predict(X_input)[0])
        auto_label_pred = auto_label(text)
        return final_text, pred_label, auto_label_pred
    except Exception as e:
        err = f"Error: {str(e)}"
        return "", err, err
|
91 |
+
|
92 |
+
# Gradio 3.x app definition (see requirements: gradio==3.37.0).
# NOTE(review): three output components means predict_sentiment must return
# three values, one per component — confirm its return shape matches.
iface = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=4, placeholder="Masukkan teks Bahasa Indonesia di sini..."),
    outputs=[
        gr.Textbox(label="Tokens"),
        gr.Label(num_top_classes=2, label="Prediksi Model Logistic Regression"),
        gr.Label(num_top_classes=2, label="Prediksi Zero-Shot (XLM-RoBERTa)")
    ],
    title="Sentiment Analysis Indonesia",
    description="Preprocessing + TF-IDF + Logistic Regression + Zero-Shot Classification",
    allow_flagging="never"  # 3.x-era kwarg; renamed in Gradio 4
)

if __name__ == "__main__":
    iface.launch()
|
logreg_model.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:279b01eb2861868e6b4386f937893289481064c107f86724073ba92518819cd2
|
3 |
+
size 1167
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==3.37.0
|
2 |
+
scikit-learn==1.5.0
|
3 |
+
nltk==3.8.1
|
4 |
+
numpy==1.26.4
|
5 |
+
Sastrawi==1.0.1
pandas
joblib
transformers
torch
|
vectorizer.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:06fb306c175ccb1f1dd3326accaca3df78513572c4477a2ed541f7e96f6aaf04
|
3 |
+
size 1169
|