Upload 4 files
Browse files- app.py +106 -0
- logreg_model.joblib +3 -0
- requirements.txt +5 -0
- vectorizer.joblib +3 -0
app.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
"""Untitled0.ipynb
|
3 |
+
|
4 |
+
Automatically generated by Colab.
|
5 |
+
|
6 |
+
Original file is located at
|
7 |
+
https://colab.research.google.com/drive/15OuC8fZC7qJeYm_3rj7H660aWfEVOx1J
|
8 |
+
"""
|
9 |
+
|
10 |
+
import pandas as pd
|
11 |
+
import re
|
12 |
+
import string
|
13 |
+
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
|
14 |
+
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
|
15 |
+
from transformers import pipeline
|
16 |
+
import gradio as gr
|
17 |
+
import joblib
|
18 |
+
|
19 |
+
# ------------- Preprocessing Functions -------------
# Informal/slang Indonesian -> standard form; applied word-by-word by normalize().
normalization_dict = {
    "gk": "tidak",
    "ga": "tidak",
    "bgt": "banget",
    "sm": "sama"
}

# Sastrawi resources are built once at import time and shared by the helpers
# below; the stopword list is materialized into a set for O(1) membership tests.
stop_factory = StopWordRemoverFactory()
stop_words = set(stop_factory.get_stop_words())
stem_factory = StemmerFactory()
stemmer = stem_factory.create_stemmer()
|
31 |
+
|
32 |
+
def cleaning(text):
    """Strip URLs, digits, and punctuation from *text*, collapsing whitespace.

    Punctuation is replaced with spaces (so "abc,def" keeps two tokens),
    then runs of whitespace are squeezed and the ends trimmed.
    """
    without_urls = re.sub(r'http\S+', '', text)
    without_digits = re.sub(r'\d+', '', without_urls)
    punct_class = rf"[{re.escape(string.punctuation)}]"
    spaced = re.sub(punct_class, ' ', without_digits)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.strip()
|
38 |
+
|
39 |
+
def normalize(text):
    """Replace known slang tokens with their standard Indonesian form."""
    replaced = (normalization_dict.get(token, token) for token in text.split())
    return ' '.join(replaced)
|
41 |
+
|
42 |
+
def remove_stopwords(tokens):
    """Drop Indonesian stopwords (Sastrawi list) from *tokens*."""
    return list(filter(lambda tok: tok not in stop_words, tokens))
|
44 |
+
|
45 |
+
def stemming(tokens):
    """Reduce each token to its root form with the Sastrawi stemmer."""
    return list(map(stemmer.stem, tokens))
|
47 |
+
|
48 |
+
def preprocess(text):
    """Full text pipeline: lowercase -> clean -> normalize -> tokenize ->
    stopword removal -> stemming.  Returns the list of processed tokens."""
    normalized = normalize(cleaning(text.lower()))
    raw_tokens = normalized.split()
    return stemming(remove_stopwords(raw_tokens))
|
56 |
+
|
57 |
+
# ------------- Load Trained Model + Vectorizer -------------
# Both artifacts ship alongside this app (stored via Git LFS) and are loaded
# once at import time so requests do not pay the deserialization cost.
model = joblib.load("logreg_model.joblib")
vectorizer = joblib.load("vectorizer.joblib")
|
60 |
+
|
61 |
+
# ------------- Load Zero-Shot Classifier (still with transformers) -------------
# NOTE(review): this downloads a large multilingual XNLI checkpoint on first
# run. Candidate labels are Indonesian and ranked by the pipeline per input.
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
labels = ["positif", "negatif"]
|
64 |
+
|
65 |
+
def auto_label(text):
    """Return the top zero-shot label for *text*, or an error string.

    Best-effort: any pipeline failure is reported as "Error: ..." rather
    than propagated, so the UI keeps working.
    """
    try:
        outcome = classifier(text, labels)
    except Exception as exc:
        return f"Error: {str(exc)}"
    return outcome['labels'][0]
|
71 |
+
|
72 |
+
# ------------- Gradio Interface -------------
def predict_sentiment(text):
    """Classify *text* with both the TF-IDF + LogReg model and the zero-shot model.

    Returns a 3-tuple matching the interface's three output components:
    (space-joined preprocessed tokens, LogReg label, zero-shot label).

    Bug fix: the previous version returned one dict keyed by plain strings,
    but the Interface declares three outputs — Gradio only accepts a dict
    return when it is keyed by component objects, so every prediction
    errored.  A positional tuple maps one value per output component.
    """
    try:
        tokens = preprocess(text)
        final_text = ' '.join(tokens)
        X_input = vectorizer.transform([final_text])
        # predict() returns a numpy scalar; cast so gr.Label gets a plain str
        pred_label = str(model.predict(X_input)[0])
        auto_label_pred = auto_label(text)
        return final_text, pred_label, auto_label_pred
    except Exception as e:
        err = f"Error: {str(e)}"
        return "", err, err
|
91 |
+
|
92 |
+
# Gradio 3.x app definition (see requirements: gradio==3.37.0).
# NOTE(review): three output components means predict_sentiment must return
# three values, one per component — confirm its return shape matches.
iface = gr.Interface(
    fn=predict_sentiment,
    inputs=gr.Textbox(lines=4, placeholder="Masukkan teks Bahasa Indonesia di sini..."),
    outputs=[
        gr.Textbox(label="Tokens"),
        gr.Label(num_top_classes=2, label="Prediksi Model Logistic Regression"),
        gr.Label(num_top_classes=2, label="Prediksi Zero-Shot (XLM-RoBERTa)")
    ],
    title="Sentiment Analysis Indonesia",
    description="Preprocessing + TF-IDF + Logistic Regression + Zero-Shot Classification",
    allow_flagging="never"  # 3.x-era kwarg; renamed in Gradio 4
)

if __name__ == "__main__":
    iface.launch()
|
logreg_model.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:279b01eb2861868e6b4386f937893289481064c107f86724073ba92518819cd2
|
3 |
+
size 1167
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio==3.37.0
|
2 |
+
scikit-learn==1.5.0
|
3 |
+
nltk==3.8.1
|
4 |
+
numpy==1.26.4
|
5 |
+
Sastrawi==1.0.1
pandas
joblib
transformers
torch
|
vectorizer.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:06fb306c175ccb1f1dd3326accaca3df78513572c4477a2ed541f7e96f6aaf04
|
3 |
+
size 1169
|