dylanjoviane committed on
Commit
9ff51b7
·
verified ·
1 Parent(s): 77cb3b6

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +106 -0
  2. logreg_model.joblib +3 -0
  3. requirements.txt +5 -0
  4. vectorizer.joblib +3 -0
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Untitled0.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/15OuC8fZC7qJeYm_3rj7H660aWfEVOx1J
8
+ """
9
+
10
+ import pandas as pd
11
+ import re
12
+ import string
13
+ from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
14
+ from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
15
+ from transformers import pipeline
16
+ import gradio as gr
17
+ import joblib
18
+
19
# ------------- Preprocessing Functions -------------
# Slang -> standard Indonesian mapping applied token-by-token in normalize().
normalization_dict = {
    "gk": "tidak",
    "ga": "tidak",
    "bgt": "banget",
    "sm": "sama",
}

# Shared Sastrawi resources for the helpers below: the Indonesian stop-word
# set (as a set for O(1) membership tests) and a stemmer instance.
stop_words = set(StopWordRemoverFactory().get_stop_words())
stemmer = StemmerFactory().create_stemmer()
31
+
32
def cleaning(text):
    """Strip URLs and digits, turn punctuation into spaces, collapse whitespace.

    Returns the cleaned string with leading/trailing whitespace removed.
    """
    no_urls = re.sub(r'http\S+', '', text)
    no_digits = re.sub(r'\d+', '', no_urls)
    # Punctuation becomes a space (not deleted) so adjacent words stay split.
    spaced = re.sub(rf"[{re.escape(string.punctuation)}]", ' ', no_digits)
    return re.sub(r'\s+', ' ', spaced).strip()
38
+
39
def normalize(text):
    """Replace slang tokens with their standard forms via normalization_dict."""
    standardized = (normalization_dict.get(token, token) for token in text.split())
    return ' '.join(standardized)
41
+
42
def remove_stopwords(tokens):
    """Drop tokens that appear in the Sastrawi stop-word set."""
    return list(filter(lambda token: token not in stop_words, tokens))
44
+
45
def stemming(tokens):
    """Reduce every token to its root form with the Sastrawi stemmer."""
    return list(map(stemmer.stem, tokens))
47
+
48
def preprocess(text):
    """Run the full text pipeline and return the resulting token list.

    Steps, in order: lowercase, strip noise (URLs/digits/punctuation),
    normalize slang, whitespace-tokenize, drop stop words, stem.
    """
    normalized = normalize(cleaning(text.lower()))
    return stemming(remove_stopwords(normalized.split()))
56
+
57
# ------------- Load Trained Model + Vectorizer -------------
# Logistic-regression classifier and its matching TF-IDF vectorizer,
# serialized with joblib. Both .joblib files must sit beside this script
# (they are shipped via git-LFS in this repo) — presumably trained on the
# same preprocessing pipeline defined above; confirm against training code.
model = joblib.load("logreg_model.joblib")
vectorizer = joblib.load("vectorizer.joblib")
60
+
61
# ------------- Load Zero-Shot Classifier (still with transformers) -------------
# Multilingual XNLI model used as a second, training-free opinion next to
# the logistic-regression model. NOTE(review): this downloads a large model
# on first run and loads it at import time, which slows app startup.
classifier = pipeline("zero-shot-classification", model="joeddav/xlm-roberta-large-xnli")
# Candidate labels for zero-shot scoring ("positive"/"negative" in Indonesian).
labels = ["positif", "negatif"]
64
+
65
def auto_label(text):
    """Return the top zero-shot label for *text*, or an error string.

    Best-effort by design: any failure from the transformers pipeline is
    rendered as a message instead of propagating, so the UI never crashes.
    """
    try:
        return classifier(text, labels)['labels'][0]
    except Exception as err:
        return f"Error: {str(err)}"
71
+
72
# ------------- Gradio Interface -------------
def predict_sentiment(text):
    """Gradio handler: preprocess, then predict with LogReg and zero-shot.

    Returns a 3-tuple aligned positionally with the interface's three output
    components: (tokens as a space-joined string, LogReg label, zero-shot
    label). BUG FIX: the previous version returned a dict keyed by display
    strings, but with a *list* of output components Gradio expects a
    positional tuple (dict returns only work when keyed by the component
    objects themselves), so the outputs were never populated correctly.
    Errors are reported in-band as strings so the UI never crashes.
    """
    try:
        tokens = preprocess(text)
        final_text = ' '.join(tokens)
        X_input = vectorizer.transform([final_text])
        pred_label = model.predict(X_input)[0]
        zero_shot_pred = auto_label(text)
        # Textbox gets the joined tokens; gr.Label accepts a plain string.
        return final_text, pred_label, zero_shot_pred
    except Exception as e:
        err_msg = f"Error: {str(e)}"
        return "", err_msg, err_msg
91
+
92
# Build the web UI: one free-text input, three read-only result panels.
text_input = gr.Textbox(lines=4, placeholder="Masukkan teks Bahasa Indonesia di sini...")
result_panels = [
    gr.Textbox(label="Tokens"),
    gr.Label(num_top_classes=2, label="Prediksi Model Logistic Regression"),
    gr.Label(num_top_classes=2, label="Prediksi Zero-Shot (XLM-RoBERTa)"),
]
iface = gr.Interface(
    fn=predict_sentiment,
    inputs=text_input,
    outputs=result_panels,
    title="Sentiment Analysis Indonesia",
    description="Preprocessing + TF-IDF + Logistic Regression + Zero-Shot Classification",
    allow_flagging="never",
)
104
+
105
# Launch the Gradio web server only when executed as a script
# (not when imported, e.g. by a Spaces runner that calls launch itself).
if __name__ == "__main__":
    iface.launch()
logreg_model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:279b01eb2861868e6b4386f937893289481064c107f86724073ba92518819cd2
3
+ size 1167
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==3.37.0
2
+ scikit-learn==1.5.0
3
+ nltk==3.8.1
4
+ numpy==1.26.4
5
+ Sastrawi==1.0.1
vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06fb306c175ccb1f1dd3326accaca3df78513572c4477a2ed541f7e96f6aaf04
3
+ size 1169